From 68d993e913ee0ac61dfcb10cabb1f8e43ea4234a Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Wed, 25 Oct 2023 21:55:38 -0700
Subject: [PATCH 001/877] [Github] Add lld to docs CI (#69821)

This patch adds the lld documentation to the documentation GitHub Actions
CI to automatically validate, in PRs and at tip of tree, that the docs
build and there aren't any Sphinx warnings. There is existing buildbot
coverage for the lld docs, but this is much more convenient to use in
cases like PRs.
---
 .github/workflows/docs.yml | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index a0670625bb3e3..cbb3706cc1bcf 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -20,6 +20,7 @@ on:
       - 'libunwind/docs/**'
       - 'libcxx/docs/**'
       - 'libc/docs/**'
+      - 'lld/docs/**'
   pull_request:
     paths:
       - 'llvm/docs/**'
@@ -29,6 +30,7 @@ on:
       - 'libunwind/docs/**'
       - 'libcxx/docs/**'
       - 'libc/docs/**'
+      - 'lld/docs/**'
 
 jobs:
   check-docs-build:
@@ -63,6 +65,8 @@ jobs:
             - 'libcxx/docs/**'
           libc:
             - 'libc/docs/**'
+          lld:
+            - 'lld/docs/**'
       - name: Fetch LLVM sources (PR)
         if: ${{ github.event_name == 'pull_request' }}
         uses: actions/checkout@v4
@@ -116,4 +120,9 @@ jobs:
         run: |
           cmake -B libc-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libc" -DLLVM_ENABLE_SPHINX=ON ./runtimes
           TZ=UTC ninja -C docs-libc-html
+      - name: Build LLD docs
+        if: steps.docs-changed-subprojects.outputs.lld_any_changed == 'true'
+        run: |
+          cmake -B lld-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_PROJECTS="lld" -DLLVM_ENABLE_SPHINX=ON ./llvm
+          TZ=UTC ninja -C lld-build docs-lld-html

From 13ea1146a78ef1e00d88b50fd0f903f336751003 Mon Sep 17 00:00:00 2001
From: Rana Pratap Reddy <109514914+ranapratap55@users.noreply.github.com>
Date: Thu, 26 Oct 2023 10:26:11 +0530
Subject: [PATCH 002/877] [AMDGPU] Lower __builtin_amdgcn_read_exec_hi to use
 amdgcn_ballot (#69567)

Currently, __builtin_amdgcn_read_exec_hi lowers to llvm.read_register;
this patch lowers it to use amdgcn_ballot instead.
---
 clang/lib/CodeGen/CGBuiltin.cpp               | 21 +++++++++++-----
 .../CodeGenOpenCL/builtins-amdgcn-wave32.cl   | 24 +++++++++++++++++++
 .../CodeGenOpenCL/builtins-amdgcn-wave64.cl   | 23 ++++++++++++++++++
 clang/test/CodeGenOpenCL/builtins-amdgcn.cl   |  4 +++-
 4 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index e1211bb8949b6..85be8bdd00516 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -7995,15 +7995,23 @@ enum SpecialRegisterAccessKind {
   Write,
 };
 
+// Generates the IR for __builtin_read_exec_*.
+// Lowers the builtin to the amdgcn_ballot intrinsic.
static Value *EmitAMDGCNBallotForExec(CodeGenFunction &CGF, const CallExpr *E, llvm::Type *RegisterType, - llvm::Type *ValueType) { + llvm::Type *ValueType, bool isExecHi) { CodeGen::CGBuilderTy &Builder = CGF.Builder; CodeGen::CodeGenModule &CGM = CGF.CGM; - llvm::Type *ResultType = CGF.ConvertType(E->getType()); - Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {ResultType}); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ballot, {RegisterType}); llvm::Value *Call = Builder.CreateCall(F, {Builder.getInt1(true)}); + + if (isExecHi) { + Value *Rt2 = Builder.CreateLShr(Call, 32); + Rt2 = Builder.CreateTrunc(Rt2, CGF.Int32Ty); + return Rt2; + } + return Call; } @@ -17857,10 +17865,11 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1}); } case AMDGPU::BI__builtin_amdgcn_read_exec: + return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, false); case AMDGPU::BI__builtin_amdgcn_read_exec_lo: - case AMDGPU::BI__builtin_amdgcn_read_exec_hi: { - return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty); - } + return EmitAMDGCNBallotForExec(*this, E, Int32Ty, Int32Ty, false); + case AMDGPU::BI__builtin_amdgcn_read_exec_hi: + return EmitAMDGCNBallotForExec(*this, E, Int64Ty, Int64Ty, true); case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray: case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_h: case AMDGPU::BI__builtin_amdgcn_image_bvh_intersect_ray_l: diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl index a4d14cf1f6cf0..43553131f63c5 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave32.cl @@ -13,6 +13,8 @@ void test_ballot_wave32(global uint* out, int a, int b) *out = __builtin_amdgcn_ballot_w32(a == b); } +// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1) #[[$NOUNWIND_READONLY:[0-9]+]] + // CHECK-LABEL: @test_ballot_wave32_target_attr( // CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 %{{.+}}) __attribute__((target("wavefrontsize32"))) @@ -21,6 +23,28 @@ void test_ballot_wave32_target_attr(global uint* out, int a, int b) *out = __builtin_amdgcn_ballot_w32(a == b); } +// CHECK-LABEL: @test_read_exec( +// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +void test_read_exec(global uint* out) { + *out = __builtin_amdgcn_read_exec(); +} + +// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1) #[[$NOUNWIND_READONLY:[0-9]+]] + +// CHECK-LABEL: @test_read_exec_lo( +// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true) +void test_read_exec_lo(global uint* out) { + *out = __builtin_amdgcn_read_exec_lo(); +} + +// CHECK-LABEL: @test_read_exec_hi( +// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true) +// CHECK: lshr i64 [[A:%.*]], 32 +// CHECK: trunc i64 [[B:%.*]] to i32 +void test_read_exec_hi(global uint* out) { + *out = __builtin_amdgcn_read_exec_hi(); +} + #if __AMDGCN_WAVEFRONT_SIZE != 32 #error Wrong wavesize detected #endif diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl index 563c9a2a240c1..53f34c6a44ae7 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wave64.cl @@ -13,6 +13,8 @@ void test_ballot_wave64(global ulong* out, int a, int b) *out = __builtin_amdgcn_ballot_w64(a == b); } +// CHECK: declare i64 @llvm.amdgcn.ballot.i64(i1) #[[$NOUNWIND_READONLY:[0-9]+]] + // CHECK-LABEL: @test_ballot_wave64_target_attr( // CHECK: call i64 
@llvm.amdgcn.ballot.i64(i1 %{{.+}})
 __attribute__((target("wavefrontsize64")))
@@ -21,6 +23,27 @@ void test_ballot_wave64_target_attr(global ulong* out, int a, int b)
   *out = __builtin_amdgcn_ballot_w64(a == b);
 }
 
+// CHECK-LABEL: @test_read_exec(
+// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
+void test_read_exec(global ulong* out) {
+  *out = __builtin_amdgcn_read_exec();
+}
+
+// CHECK-LABEL: @test_read_exec_lo(
+// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
+void test_read_exec_lo(global ulong* out) {
+  *out = __builtin_amdgcn_read_exec_lo();
+}
+
+// CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1) #[[$NOUNWIND_READONLY:[0-9]+]]
+
+// CHECK-LABEL: @test_read_exec_hi(
+// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
+// CHECK: lshr i64 [[A:%.*]], 32
+void test_read_exec_hi(global ulong* out) {
+  *out = __builtin_amdgcn_read_exec_hi();
+}
+
 #if __AMDGCN_WAVEFRONT_SIZE != 64
 #error Wrong wavesize detected
 #endif
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 8938642e3b19f..0bc9a54682d3e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -526,7 +526,9 @@ void test_read_exec_lo(global uint* out) {
 // CHECK: declare i32 @llvm.amdgcn.ballot.i32(i1) #[[$NOUNWIND_READONLY:[0-9]+]]
 
 // CHECK-LABEL: @test_read_exec_hi(
-// CHECK: call i32 @llvm.amdgcn.ballot.i32(i1 true)
+// CHECK: call i64 @llvm.amdgcn.ballot.i64(i1 true)
+// CHECK: lshr i64 [[A:%.*]], 32
+// CHECK: trunc i64 [[B:%.*]] to i32
 void test_read_exec_hi(global uint* out) {
   *out = __builtin_amdgcn_read_exec_hi();
 }

From 9c3c0e324f5152c699cc14b79a630589be5eced3 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Thu, 26 Oct 2023 13:00:36 +0800
Subject: [PATCH 003/877] [RISCV] Separate addend from FMA operands to support
 cascade FMA. NFC. (#70241)

This PR separates the addend from the FMA operands to support cascaded
FMAs. In some microarchitectures (e.g., Arm Cortex-A72 and
XiangShan-NanHu), FP multiply-accumulate pipelines support
late-forwarding of accumulate operands, which reduces the latency of a
sequence of multiply-accumulate instructions. See also #70232.
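
To illustrate the kind of code this lets targets model (a hypothetical
sketch, not code from this patch), an accumulation chain consumes each
FMA result only through the addend operand of the next FMA:

  // Hypothetical C example: with the D extension, each __builtin_fma
  // call lowers to an fmadd.d, and the previous result is needed only
  // as the addend of the next fmadd.d.
  double dot(const double *a, const double *b, int n) {
    double acc = 0.0;
    for (int i = 0; i < n; ++i)
      acc = __builtin_fma(a[i], b[i], acc);
    return acc;
  }

With a dedicated ReadFMA*Addend scheduling read, a subtarget can attach
a nonzero ReadAdvance to just the addend operand to model
late-forwarding; this patch keeps all ReadAdvance values at 0, so it is
NFC.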
--- llvm/lib/Target/RISCV/RISCVInstrInfoD.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoF.td | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 2 +- llvm/lib/Target/RISCV/RISCVSchedRocket.td | 2 ++ llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 3 +++ llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td | 2 ++ llvm/lib/Target/RISCV/RISCVSchedule.td | 3 +++ 7 files changed, 13 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td index 59312f02aeceb..34becfafe7747 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td @@ -78,7 +78,7 @@ def FSD : FPStore_r<0b011, "fsd", FPR64, WriteFST64>; } // Predicates = [HasStdExtD] foreach Ext = DExts in { - let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64] in { + let SchedRW = [WriteFMA64, ReadFMA64, ReadFMA64, ReadFMA64Addend] in { defm FMADD_D : FPFMA_rrr_frm_m; defm FMSUB_D : FPFMA_rrr_frm_m; defm FNMSUB_D : FPFMA_rrr_frm_m; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td index 8726245f1602e..3a5794bb2d194 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td @@ -302,7 +302,7 @@ def FSW : FPStore_r<0b010, "fsw", FPR32, WriteFST32>; } // Predicates = [HasStdExtF] foreach Ext = FExts in { - let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32] in { + let SchedRW = [WriteFMA32, ReadFMA32, ReadFMA32, ReadFMA32Addend] in { defm FMADD_S : FPFMA_rrr_frm_m; defm FMSUB_S : FPFMA_rrr_frm_m; defm FNMSUB_S : FPFMA_rrr_frm_m; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index b65e9f5af0331..1dc391d3f084f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -85,7 +85,7 @@ def FSH : FPStore_r<0b001, "fsh", FPR16, WriteFST16>; } // Predicates = [HasHalfFPLoadStoreMove] foreach Ext = ZfhExts in { - let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16] in { + let SchedRW = [WriteFMA16, ReadFMA16, ReadFMA16, ReadFMA16Addend] in { defm FMADD_H : FPFMA_rrr_frm_m; defm FMSUB_H : FPFMA_rrr_frm_m; defm FNMSUB_H : FPFMA_rrr_frm_m; diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket.td b/llvm/lib/Target/RISCV/RISCVSchedRocket.td index 8fbc9afe267c5..bb9dfe5d01240 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedRocket.td +++ b/llvm/lib/Target/RISCV/RISCVSchedRocket.td @@ -206,7 +206,9 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index 96ebe8e3e6768..d2447cf23e266 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -933,10 +933,13 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; +def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td index 960258c8bc7df..06ad2075b0736 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSyntacoreSCR1.td @@ -164,7 +164,9 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; def : 
ReadAdvance;
+def : ReadAdvance;
 def : ReadAdvance;
+def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
 def : ReadAdvance;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
index af318ea5bf685..f6c1b096ad90c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedule.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -150,8 +150,11 @@ def ReadFMul16 : SchedRead; // 16-bit floating point multiply
 def ReadFMul32 : SchedRead; // 32-bit floating point multiply
 def ReadFMul64 : SchedRead; // 64-bit floating point multiply
 def ReadFMA16 : SchedRead; // 16-bit floating point fused multiply-add
+def ReadFMA16Addend : SchedRead; // 16-bit floating point fused multiply-add (addend)
 def ReadFMA32 : SchedRead; // 32-bit floating point fused multiply-add
+def ReadFMA32Addend : SchedRead; // 32-bit floating point fused multiply-add (addend)
 def ReadFMA64 : SchedRead; // 64-bit floating point fused multiply-add
+def ReadFMA64Addend : SchedRead; // 64-bit floating point fused multiply-add (addend)
 def ReadFDiv16 : SchedRead; // 16-bit floating point divide
 def ReadFDiv32 : SchedRead; // 32-bit floating point divide
 def ReadFDiv64 : SchedRead; // 64-bit floating point divide

From 1097c71dbeefaff0c353c90cb57bc07b6ede6383 Mon Sep 17 00:00:00 2001
From: Piotr Zegar
Date: Thu, 26 Oct 2023 07:10:39 +0200
Subject: [PATCH 004/877] [clang-tidy] Support functional cast in
 bugprone-dangling-handle (#69067)

Add support for constructor-conversion-based functional casts. This
allows detecting issues like:

  const std::string_view test1 = std::string(a);
---
 .../clang-tidy/bugprone/DanglingHandleCheck.cpp      | 12 +++++++++---
 clang-tools-extra/docs/ReleaseNotes.rst              |  5 +++++
 .../clang-tidy/checkers/bugprone/dangling-handle.cpp |  8 ++++++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp
index 9ded699ba78e6..d55df3a6d7b74 100644
--- a/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/DanglingHandleCheck.cpp
@@ -33,14 +33,20 @@ handleFrom(const ast_matchers::internal::Matcher &IsAHandle,
 
 ast_matchers::internal::Matcher handleFromTemporaryValue(
     const ast_matchers::internal::Matcher &IsAHandle) {
+
+  const auto TemporaryExpr = anyOf(
+      cxxBindTemporaryExpr(),
+      cxxFunctionalCastExpr(
+          hasCastKind(CK_ConstructorConversion),
+          hasSourceExpression(ignoringParenImpCasts(cxxBindTemporaryExpr()))));
   // If a ternary operator returns a temporary value, then both branches hold a
   // temporary value. If one of them is not a temporary then it must be copied
   // into one to satisfy the type of the operator.
const auto TemporaryTernary = conditionalOperator( - hasTrueExpression(ignoringParenImpCasts(cxxBindTemporaryExpr())), - hasFalseExpression(ignoringParenImpCasts(cxxBindTemporaryExpr()))); + hasTrueExpression(ignoringParenImpCasts(TemporaryExpr)), + hasFalseExpression(ignoringParenImpCasts(TemporaryExpr))); - return handleFrom(IsAHandle, anyOf(cxxBindTemporaryExpr(), TemporaryTernary)); + return handleFrom(IsAHandle, anyOf(TemporaryExpr, TemporaryTernary)); } ast_matchers::internal::Matcher isASequence() { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 6d1992e12130d..ac95afd782e1d 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -207,6 +207,11 @@ New check aliases Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Improved :doc:`bugprone-dangling-handle + ` check to support functional + casting during type conversions at variable initialization, now with improved + compatibility for C++17 and later versions. + - Improved :doc:`bugprone-lambda-function-name ` check by adding option `IgnoreMacros` to ignore warnings in macros. diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/dangling-handle.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/dangling-handle.cpp index 23cda53217643..96c812617038a 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/dangling-handle.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/dangling-handle.cpp @@ -108,6 +108,14 @@ void Positives() { std::string_view view4(ReturnsAString()); // CHECK-MESSAGES-CXX14: [[@LINE-1]]:20: warning: std::basic_string_view outlives // CHECK-MESSAGES-CXX17: [[@LINE-2]]:26: warning: std::basic_string_view outlives + + std::string_view view5 = std::string("test"); + // CHECK-MESSAGES-CXX14: [[@LINE-1]]:20: warning: std::basic_string_view outlives its value [bugprone-dangling-handle] + // CHECK-MESSAGES-CXX17: [[@LINE-2]]:28: warning: std::basic_string_view outlives its value [bugprone-dangling-handle] + + std::string_view view6 = std::string{"test"}; + // CHECK-MESSAGES-CXX14: [[@LINE-1]]:20: warning: std::basic_string_view outlives its value [bugprone-dangling-handle] + // CHECK-MESSAGES-CXX17: [[@LINE-2]]:28: warning: std::basic_string_view outlives its value [bugprone-dangling-handle] } void OtherTypes() { From af07d7ba883b6e4921820d88b6679f294a0b9fa5 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Thu, 26 Oct 2023 07:11:01 +0200 Subject: [PATCH 005/877] [clang-tidy] Improved cppcoreguidelines-pro-type-const-cast (#69501) Improved cppcoreguidelines-pro-type-const-cast check to ignore casts to const type (controlled by option) and casts in implicitly invoked code. 
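
For illustration (a hypothetical snippet; the tests added in this patch
cover more cases), with StrictMode disabled the check now warns only
when a cast actually removes a qualifier:

  const int *ci = nullptr;
  int *p = const_cast<int *>(ci);             // flagged: removes const
  const int *c2 = const_cast<const int *>(p); // ignored: only adds const
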
Fixes #69319 --- .../ProTypeConstCastCheck.cpp | 49 ++++++++++- .../cppcoreguidelines/ProTypeConstCastCheck.h | 12 ++- clang-tools-extra/docs/ReleaseNotes.rst | 5 ++ .../cppcoreguidelines/pro-type-const-cast.rst | 32 ++++++- .../cppcoreguidelines/pro-type-const-cast.cpp | 86 ++++++++++++++++++- .../nonstandard-file-extension.test | 2 +- 6 files changed, 174 insertions(+), 12 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp index ef803ab85fa08..8c44c1bfb62b6 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.cpp @@ -14,13 +14,60 @@ using namespace clang::ast_matchers; namespace clang::tidy::cppcoreguidelines { +static bool hasConstQualifier(QualType Type) { + const QualType PtrType = Type->getPointeeType(); + if (!PtrType.isNull()) + return hasConstQualifier(PtrType); + + return Type.isConstQualified(); +} + +static bool hasVolatileQualifier(QualType Type) { + const QualType PtrType = Type->getPointeeType(); + if (!PtrType.isNull()) + return hasVolatileQualifier(PtrType); + return Type.isVolatileQualified(); +} + +ProTypeConstCastCheck::ProTypeConstCastCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + StrictMode(Options.getLocalOrGlobal("StrictMode", false)) {} + +void ProTypeConstCastCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "StrictMode", StrictMode); +} + void ProTypeConstCastCheck::registerMatchers(MatchFinder *Finder) { Finder->addMatcher(cxxConstCastExpr().bind("cast"), this); } void ProTypeConstCastCheck::check(const MatchFinder::MatchResult &Result) { const auto *MatchedCast = Result.Nodes.getNodeAs("cast"); - diag(MatchedCast->getOperatorLoc(), "do not use const_cast"); + if (StrictMode) { + diag(MatchedCast->getOperatorLoc(), "do not use const_cast"); + return; + } + + const QualType TargetType = MatchedCast->getType().getCanonicalType(); + const QualType SourceType = + MatchedCast->getSubExpr()->getType().getCanonicalType(); + + const bool RemovingConst = + hasConstQualifier(SourceType) && !hasConstQualifier(TargetType); + const bool RemovingVolatile = + hasVolatileQualifier(SourceType) && !hasVolatileQualifier(TargetType); + + if (!RemovingConst && !RemovingVolatile) { + // Cast is doing nothing. + return; + } + + diag(MatchedCast->getOperatorLoc(), + "do not use const_cast to remove%select{| const}0%select{| " + "and}2%select{| volatile}1 qualifier") + << RemovingConst << RemovingVolatile + << (RemovingConst && RemovingVolatile); } } // namespace clang::tidy::cppcoreguidelines diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h index f7ae9bbb60dcd..8d93633a321b5 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/ProTypeConstCastCheck.h @@ -13,19 +13,25 @@ namespace clang::tidy::cppcoreguidelines { -/// This check flags all instances of const_cast +/// Imposes limitations on the use of const_cast within C++ code. 
/// /// For the user-facing documentation see: /// http://clang.llvm.org/extra/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.html class ProTypeConstCastCheck : public ClangTidyCheck { public: - ProTypeConstCastCheck(StringRef Name, ClangTidyContext *Context) - : ClangTidyCheck(Name, Context) {} + ProTypeConstCastCheck(StringRef Name, ClangTidyContext *Context); bool isLanguageVersionSupported(const LangOptions &LangOpts) const override { return LangOpts.CPlusPlus; } void registerMatchers(ast_matchers::MatchFinder *Finder) override; void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + std::optional getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } + +private: + const bool StrictMode; }; } // namespace clang::tidy::cppcoreguidelines diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ac95afd782e1d..13003a118c36a 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -255,6 +255,11 @@ Changes in existing checks ` check to perform checks on derived classes of ``std::array``. +- Improved :doc:`cppcoreguidelines-pro-type-const-cast + ` check to ignore + casts to ``const`` or ``volatile`` type (controlled by `StrictMode` option) + and casts in implicitly invoked code. + - Improved :doc:`cppcoreguidelines-pro-type-member-init ` check to ignore dependent delegate constructors. diff --git a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.rst b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.rst index eb572e625f129..961a591cb81f8 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/cppcoreguidelines/pro-type-const-cast.rst @@ -3,11 +3,35 @@ cppcoreguidelines-pro-type-const-cast ===================================== -This check flags all uses of ``const_cast`` in C++ code. +Imposes limitations on the use of ``const_cast`` within C++ code. It depends on +the :option:`StrictMode` option setting to determine whether it should flag all +instances of ``const_cast`` or only those that remove either ``const`` or +``volatile`` qualifier. -Modifying a variable that was declared const is undefined behavior, even with -``const_cast``. +Modifying a variable that has been declared as ``const`` in C++ is generally +considered undefined behavior, and this remains true even when using +``const_cast``. In C++, the ``const`` qualifier indicates that a variable is +intended to be read-only, and the compiler enforces this by disallowing any +attempts to change the value of that variable. + +Removing the ``volatile`` qualifier in C++ can have serious consequences. This +qualifier indicates that a variable's value can change unpredictably, and +removing it may lead to undefined behavior, optimization problems, and debugging +challenges. It's essential to retain the ``volatile`` qualifier in situations +where the variable's volatility is a crucial aspect of program correctness and +reliability. This rule is part of the `Type safety (Type 3) `_ -profile from the C++ Core Guidelines. +profile and `ES.50: Don’t cast away const +`_ +rule from the C++ Core Guidelines. + +Options +------- + +.. option:: StrictMode + + When this setting is set to `true`, it means that any usage of ``const_cast`` + is not allowed. 
On the other hand, when it's set to `false`, it permits + casting to ``const`` or ``volatile`` types. Default value is `false`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp index 2d32e13723abf..be70e3ba35699 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/pro-type-const-cast.cpp @@ -1,6 +1,86 @@ -// RUN: %check_clang_tidy %s cppcoreguidelines-pro-type-const-cast %t +// RUN: %check_clang_tidy -check-suffix=STRICT %s cppcoreguidelines-pro-type-const-cast %t -- -config="{CheckOptions: {StrictMode: true}}" +// RUN: %check_clang_tidy -check-suffix=NSTRICT %s cppcoreguidelines-pro-type-const-cast %t +namespace Const { const int *i; int *j; -void f() { j = const_cast(i); } -// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + +void f() { + j = const_cast(i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:7: warning: do not use const_cast to remove const qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = const_cast(j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = *const_cast(&i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove const qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = *const_cast(&j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = &const_cast(*i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove const qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = &const_cast(*j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] +} +} + +namespace Volatile { +volatile int *i; +int *j; + +void f() { + j = const_cast(i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:7: warning: do not use const_cast to remove volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = const_cast(j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = *const_cast(&i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = *const_cast(&j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = &const_cast(*i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = &const_cast(*j); + // 
CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] +} +} + +namespace ConstAndVolatile { +const volatile int *i; +int *j; + +void f() { + j = const_cast(i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:7: warning: do not use const_cast to remove const and volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = const_cast(j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:7: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = *const_cast(&i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove const and volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = *const_cast(&j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + j = &const_cast(*i); + // CHECK-MESSAGES-NSTRICT: :[[@LINE-1]]:8: warning: do not use const_cast to remove const and volatile qualifier [cppcoreguidelines-pro-type-const-cast] + // CHECK-MESSAGES-STRICT: :[[@LINE-2]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] + + i = &const_cast(*j); + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:8: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] +} +} diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/nonstandard-file-extension.test b/clang-tools-extra/test/clang-tidy/infrastructure/nonstandard-file-extension.test index 4cb4d1171195a..9f98d86d0bcc0 100644 --- a/clang-tools-extra/test/clang-tidy/infrastructure/nonstandard-file-extension.test +++ b/clang-tools-extra/test/clang-tidy/infrastructure/nonstandard-file-extension.test @@ -3,4 +3,4 @@ const int *i; int *j; void f() { j = const_cast(i); } -// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: do not use const_cast [cppcoreguidelines-pro-type-const-cast] +// CHECK-MESSAGES: :[[@LINE-1]]:16: warning: do not use const_cast to remove const qualifier [cppcoreguidelines-pro-type-const-cast] From fd06155acb620b047fa1d586383f8738e17dbec3 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Thu, 26 Oct 2023 07:16:25 +0200 Subject: [PATCH 006/877] [clang-tidy] Improved cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes (#69242) Extended IgnoreConversionFromTypes option to include types without a declaration, such as built-in types. 
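
For illustration (mirroring the test added in this patch), listing the
built-in type "long" in IgnoreConversionFromTypes now suppresses the
warning even though built-in types have no declaration for the matcher
to name:

  // With IgnoreConversionFromTypes: "global_size_t;nested_size_type;long"
  long x = 123;
  short y = x; // narrowing long -> short, ignored when "long" is listed
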
--- .../NarrowingConversionsCheck.cpp | 38 ++++++++++++++----- clang-tools-extra/docs/ReleaseNotes.rst | 5 +++ ...sions-ignoreconversionfromtypes-option.cpp | 9 ++++- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp index 1b858db511f50..45fef9471d521 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/NarrowingConversionsCheck.cpp @@ -14,6 +14,7 @@ #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "llvm/ADT/APSInt.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" @@ -23,6 +24,26 @@ using namespace clang::ast_matchers; namespace clang::tidy::cppcoreguidelines { +namespace { + +AST_MATCHER_P(QualType, hasAnyType, std::vector, Names) { + if (Names.empty()) + return false; + + std::string Name = Node.getLocalUnqualifiedType().getAsString(); + return llvm::any_of(Names, [&Name](StringRef Ref) { return Ref == Name; }); +} + +AST_MATCHER(FieldDecl, hasIntBitwidth) { + assert(Node.isBitField()); + const ASTContext &Ctx = Node.getASTContext(); + unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); + unsigned CurrentBitWidth = Node.getBitWidthValue(Ctx); + return IntBitWidth == CurrentBitWidth; +} + +} // namespace + NarrowingConversionsCheck::NarrowingConversionsCheck(StringRef Name, ClangTidyContext *Context) : ClangTidyCheck(Name, Context), @@ -53,25 +74,22 @@ void NarrowingConversionsCheck::storeOptions( Options.store(Opts, "PedanticMode", PedanticMode); } -AST_MATCHER(FieldDecl, hasIntBitwidth) { - assert(Node.isBitField()); - const ASTContext &Ctx = Node.getASTContext(); - unsigned IntBitWidth = Ctx.getIntWidth(Ctx.IntTy); - unsigned CurrentBitWidth = Node.getBitWidthValue(Ctx); - return IntBitWidth == CurrentBitWidth; -} - void NarrowingConversionsCheck::registerMatchers(MatchFinder *Finder) { // ceil() and floor() are guaranteed to return integers, even though the type // is not integral. const auto IsCeilFloorCallExpr = expr(callExpr(callee(functionDecl( hasAnyName("::ceil", "::std::ceil", "::floor", "::std::floor"))))); + std::vector IgnoreConversionFromTypesVec = + utils::options::parseStringList(IgnoreConversionFromTypes); + // We may want to exclude other types from the checks, such as `size_type` // and `difference_type`. These are often used to count elements, represented // in 64 bits and assigned to `int`. Rarely are people counting >2B elements. - const auto IsConversionFromIgnoredType = hasType(namedDecl( - hasAnyName(utils::options::parseStringList(IgnoreConversionFromTypes)))); + const auto IsConversionFromIgnoredType = + anyOf(hasType(namedDecl(hasAnyName(IgnoreConversionFromTypesVec))), + allOf(unless(hasType(namedDecl())), + hasType(qualType(hasAnyType(IgnoreConversionFromTypesVec))))); // `IsConversionFromIgnoredType` will ignore narrowing calls from those types, // but not expressions that are promoted to an ignored type as a result of a diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 13003a118c36a..c93775beb8aea 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -243,6 +243,11 @@ Changes in existing checks coroutine functions and increase issue detection for cases involving type aliases with references. 
+- Improved :doc:`cppcoreguidelines-narrowing-conversions + ` check by + extending the `IgnoreConversionFromTypes` option to include types without a + declaration, such as built-in types. + - Improved :doc:`cppcoreguidelines-prefer-member-initializer ` check to ignore delegate constructors. diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp index ab9aabf44ff68..91e908f535a0d 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/narrowing-conversions-ignoreconversionfromtypes-option.cpp @@ -4,7 +4,7 @@ // RUN: %check_clang_tidy -check-suffix=IGNORED %s \ // RUN: cppcoreguidelines-narrowing-conversions %t -- \ // RUN: -config='{CheckOptions: { \ -// RUN: cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type" \ +// RUN: cppcoreguidelines-narrowing-conversions.IgnoreConversionFromTypes: "global_size_t;nested_size_type;long" \ // RUN: }}' // We use global_size_t instead of 'size_t' because windows predefines size_t. @@ -72,3 +72,10 @@ void most_narrowing_is_not_ok() { // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions] // CHECK-MESSAGES-IGNORED: :[[@LINE-2]]:7: warning: narrowing conversion from 'long long' to signed type 'int' is implementation-defined [cppcoreguidelines-narrowing-conversions] } + +void test_ignore_builtin_type_pr58809() { + long x = 123; + short y = x; + // CHECK-MESSAGES-DEFAULT: :[[@LINE-1]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions] + // CHECK-MESSAGES-NOT-IGNORED: :[[@LINE-2]]:13: warning: narrowing conversion from 'long' to signed type 'short' is implementation-defined [cppcoreguidelines-narrowing-conversions] +} From ec6da0652282d29569faa628d2180909fa588906 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 00:35:45 -0700 Subject: [PATCH 007/877] Apply clang-tidy fixes for misc-include-cleaner in MLIR examples --- mlir/examples/toy/Ch2/toyc.cpp | 6 +++++- mlir/examples/toy/Ch3/mlir/Dialect.cpp | 12 +++++++++++ mlir/examples/toy/Ch3/mlir/MLIRGen.cpp | 15 ++++++++++++-- mlir/examples/toy/Ch3/mlir/ToyCombine.cpp | 5 +++-- mlir/examples/toy/Ch3/parser/AST.cpp | 3 +++ mlir/examples/toy/Ch3/toyc.cpp | 9 ++++++++- mlir/examples/toy/Ch4/mlir/Dialect.cpp | 15 ++++++++++++++ mlir/examples/toy/Ch4/mlir/MLIRGen.cpp | 15 ++++++++++++-- .../toy/Ch4/mlir/ShapeInferencePass.cpp | 8 ++++++++ mlir/examples/toy/Ch4/mlir/ToyCombine.cpp | 5 +++-- mlir/examples/toy/Ch4/parser/AST.cpp | 3 +++ mlir/examples/toy/Ch4/toyc.cpp | 9 ++++++++- mlir/examples/toy/Ch5/mlir/Dialect.cpp | 15 ++++++++++++++ .../toy/Ch5/mlir/LowerToAffineLoops.cpp | 18 +++++++++++++++++ mlir/examples/toy/Ch5/mlir/MLIRGen.cpp | 15 ++++++++++++-- .../toy/Ch5/mlir/ShapeInferencePass.cpp | 8 ++++++++ mlir/examples/toy/Ch5/mlir/ToyCombine.cpp | 5 +++-- mlir/examples/toy/Ch5/parser/AST.cpp | 3 +++ mlir/examples/toy/Ch5/toyc.cpp | 9 ++++++++- mlir/examples/toy/Ch6/mlir/Dialect.cpp | 15 ++++++++++++++ .../toy/Ch6/mlir/LowerToAffineLoops.cpp | 18 +++++++++++++++++ mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp | 13 
++++++++++-- mlir/examples/toy/Ch6/mlir/MLIRGen.cpp | 15 ++++++++++++-- .../toy/Ch6/mlir/ShapeInferencePass.cpp | 8 ++++++++ mlir/examples/toy/Ch6/mlir/ToyCombine.cpp | 5 +++-- mlir/examples/toy/Ch6/parser/AST.cpp | 3 +++ mlir/examples/toy/Ch6/toyc.cpp | 11 +++++++++- mlir/examples/toy/Ch7/mlir/Dialect.cpp | 20 +++++++++++++++++++ .../toy/Ch7/mlir/LowerToAffineLoops.cpp | 18 +++++++++++++++++ mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp | 13 ++++++++++-- mlir/examples/toy/Ch7/mlir/MLIRGen.cpp | 18 ++++++++++++++++- .../toy/Ch7/mlir/ShapeInferencePass.cpp | 8 ++++++++ mlir/examples/toy/Ch7/mlir/ToyCombine.cpp | 9 +++++++-- mlir/examples/toy/Ch7/parser/AST.cpp | 3 +++ mlir/examples/toy/Ch7/toyc.cpp | 11 +++++++++- .../transform/Ch2/lib/MyExtension.cpp | 8 ++++++++ 36 files changed, 345 insertions(+), 29 deletions(-) diff --git a/mlir/examples/toy/Ch2/toyc.cpp b/mlir/examples/toy/Ch2/toyc.cpp index 59ac5cab0637b..fa431972e211e 100644 --- a/mlir/examples/toy/Ch2/toyc.cpp +++ b/mlir/examples/toy/Ch2/toyc.cpp @@ -10,15 +10,19 @@ // //===----------------------------------------------------------------------===// +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" #include +#include +#include +#include #include "mlir/IR/AsmState.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" -#include "mlir/IR/Verifier.h" #include "mlir/Parser/Parser.h" #include "llvm/ADT/StringRef.h" diff --git a/mlir/examples/toy/Ch3/mlir/Dialect.cpp b/mlir/examples/toy/Ch3/mlir/Dialect.cpp index 2e492404c3f6c..79d82e59645d9 100644 --- a/mlir/examples/toy/Ch3/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch3/mlir/Dialect.cpp @@ -13,10 +13,22 @@ #include "toy/Dialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/Value.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include using namespace mlir; using namespace mlir::toy; diff --git a/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp index b1abd37c57257..2f0a88f7095b7 100644 --- a/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp +++ b/mlir/examples/toy/Ch3/mlir/MLIRGen.cpp @@ -12,20 +12,31 @@ //===----------------------------------------------------------------------===// #include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/AST.h" #include "toy/Dialect.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include #include +#include +#include using namespace mlir::toy; using namespace toy; diff --git a/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp index 01bd10c2fd173..3ce35c86eb880 100644 --- a/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp +++ 
b/mlir/examples/toy/Ch3/mlir/ToyCombine.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/Matchers.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/Dialect.h" -#include using namespace mlir; using namespace toy; diff --git a/mlir/examples/toy/Ch3/parser/AST.cpp b/mlir/examples/toy/Ch3/parser/AST.cpp index 2eaabb1b529e1..2546f2a9725d6 100644 --- a/mlir/examples/toy/Ch3/parser/AST.cpp +++ b/mlir/examples/toy/Ch3/parser/AST.cpp @@ -12,9 +12,12 @@ #include "toy/AST.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include using namespace toy; diff --git a/mlir/examples/toy/Ch3/toyc.cpp b/mlir/examples/toy/Ch3/toyc.cpp index 4b7f41285e066..8c27a7af97a00 100644 --- a/mlir/examples/toy/Ch3/toyc.cpp +++ b/mlir/examples/toy/Ch3/toyc.cpp @@ -10,7 +10,11 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/Diagnostics.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" @@ -19,7 +23,6 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" @@ -29,6 +32,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include using namespace toy; namespace cl = llvm::cl; diff --git a/mlir/examples/toy/Ch4/mlir/Dialect.cpp b/mlir/examples/toy/Ch4/mlir/Dialect.cpp index de33b57c6a804..cc0ea5c4a6375 100644 --- a/mlir/examples/toy/Ch4/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch4/mlir/Dialect.cpp @@ -13,11 +13,26 @@ #include "toy/Dialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Location.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include using namespace mlir; using namespace mlir::toy; diff --git a/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp index 0b200f105fbf5..6c5474a9646bc 100644 --- a/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp +++ b/mlir/examples/toy/Ch4/mlir/MLIRGen.cpp @@ -12,20 +12,31 @@ //===----------------------------------------------------------------------===// #include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/AST.h" #include "toy/Dialect.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include 
"llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include #include +#include +#include using namespace mlir::toy; using namespace toy; diff --git a/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp index d45baa14ab3e8..a9e995ed91bff 100644 --- a/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp +++ b/mlir/examples/toy/Ch4/mlir/ShapeInferencePass.cpp @@ -11,13 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" #include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define DEBUG_TYPE "shape-inference" diff --git a/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp index 01bd10c2fd173..3ce35c86eb880 100644 --- a/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp +++ b/mlir/examples/toy/Ch4/mlir/ToyCombine.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/Matchers.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/Dialect.h" -#include using namespace mlir; using namespace toy; diff --git a/mlir/examples/toy/Ch4/parser/AST.cpp b/mlir/examples/toy/Ch4/parser/AST.cpp index 2eaabb1b529e1..2546f2a9725d6 100644 --- a/mlir/examples/toy/Ch4/parser/AST.cpp +++ b/mlir/examples/toy/Ch4/parser/AST.cpp @@ -12,9 +12,12 @@ #include "toy/AST.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include using namespace toy; diff --git a/mlir/examples/toy/Ch4/toyc.cpp b/mlir/examples/toy/Ch4/toyc.cpp index 5e46b41ed1294..d35e07cf3f20b 100644 --- a/mlir/examples/toy/Ch4/toyc.cpp +++ b/mlir/examples/toy/Ch4/toyc.cpp @@ -10,7 +10,11 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/Diagnostics.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" #include "toy/Passes.h" @@ -20,7 +24,6 @@ #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" @@ -30,6 +33,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include using namespace toy; namespace cl = llvm::cl; diff --git a/mlir/examples/toy/Ch5/mlir/Dialect.cpp b/mlir/examples/toy/Ch5/mlir/Dialect.cpp index 2d474af1300af..74adfeb64cce5 100644 --- a/mlir/examples/toy/Ch5/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch5/mlir/Dialect.cpp @@ -13,11 +13,26 @@ #include "toy/Dialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Location.h" #include "mlir/IR/OpImplementation.h" +#include 
"mlir/IR/OperationSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include using namespace mlir; using namespace mlir::toy; diff --git a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp index fd589ddf84541..240b9f9338665 100644 --- a/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch5/mlir/LowerToAffineLoops.cpp @@ -12,7 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" @@ -22,7 +32,15 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include using namespace mlir; diff --git a/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp index 0b200f105fbf5..6c5474a9646bc 100644 --- a/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp +++ b/mlir/examples/toy/Ch5/mlir/MLIRGen.cpp @@ -12,20 +12,31 @@ //===----------------------------------------------------------------------===// #include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/AST.h" #include "toy/Dialect.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include #include +#include +#include using namespace mlir::toy; using namespace toy; diff --git a/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp index d45baa14ab3e8..a9e995ed91bff 100644 --- a/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp +++ b/mlir/examples/toy/Ch5/mlir/ShapeInferencePass.cpp @@ -11,13 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" #include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define 
DEBUG_TYPE "shape-inference" diff --git a/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp index 01bd10c2fd173..3ce35c86eb880 100644 --- a/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp +++ b/mlir/examples/toy/Ch5/mlir/ToyCombine.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/Matchers.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/Dialect.h" -#include using namespace mlir; using namespace toy; diff --git a/mlir/examples/toy/Ch5/parser/AST.cpp b/mlir/examples/toy/Ch5/parser/AST.cpp index 2eaabb1b529e1..2546f2a9725d6 100644 --- a/mlir/examples/toy/Ch5/parser/AST.cpp +++ b/mlir/examples/toy/Ch5/parser/AST.cpp @@ -12,9 +12,12 @@ #include "toy/AST.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include using namespace toy; diff --git a/mlir/examples/toy/Ch5/toyc.cpp b/mlir/examples/toy/Ch5/toyc.cpp index db09315159168..e0742b8e992b3 100644 --- a/mlir/examples/toy/Ch5/toyc.cpp +++ b/mlir/examples/toy/Ch5/toyc.cpp @@ -11,7 +11,11 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Func/Extensions/AllExtensions.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" #include "toy/Passes.h" @@ -23,7 +27,6 @@ #include "mlir/IR/Verifier.h" #include "mlir/InitAllDialects.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Transforms/Passes.h" @@ -33,6 +36,10 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include using namespace toy; namespace cl = llvm::cl; diff --git a/mlir/examples/toy/Ch6/mlir/Dialect.cpp b/mlir/examples/toy/Ch6/mlir/Dialect.cpp index 2d474af1300af..74adfeb64cce5 100644 --- a/mlir/examples/toy/Ch6/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch6/mlir/Dialect.cpp @@ -13,11 +13,26 @@ #include "toy/Dialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Location.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" #include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include using namespace mlir; using namespace mlir::toy; diff --git a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp index fd589ddf84541..240b9f9338665 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToAffineLoops.cpp @@ -12,7 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include 
"mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" @@ -22,7 +32,15 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include using namespace mlir; diff --git a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp index 67b82f91b1dbd..f91d880413c9c 100644 --- a/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp +++ b/mlir/examples/toy/Ch6/mlir/LowerToLLVM.cpp @@ -22,6 +22,14 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" @@ -34,7 +42,6 @@ #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -42,7 +49,9 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include +#include using namespace mlir; diff --git a/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp index 0b200f105fbf5..6c5474a9646bc 100644 --- a/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp +++ b/mlir/examples/toy/Ch6/mlir/MLIRGen.cpp @@ -12,20 +12,31 @@ //===----------------------------------------------------------------------===// #include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/AST.h" #include "toy/Dialect.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include +#include +#include #include +#include +#include using namespace mlir::toy; using namespace toy; diff --git a/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp index d45baa14ab3e8..a9e995ed91bff 100644 --- a/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp +++ b/mlir/examples/toy/Ch6/mlir/ShapeInferencePass.cpp @@ -11,13 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" #include 
"toy/Dialect.h" #include "toy/Passes.h" #include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define DEBUG_TYPE "shape-inference" diff --git a/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp index 01bd10c2fd173..3ce35c86eb880 100644 --- a/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp +++ b/mlir/examples/toy/Ch6/mlir/ToyCombine.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/Matchers.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/Dialect.h" -#include using namespace mlir; using namespace toy; diff --git a/mlir/examples/toy/Ch6/parser/AST.cpp b/mlir/examples/toy/Ch6/parser/AST.cpp index 2eaabb1b529e1..2546f2a9725d6 100644 --- a/mlir/examples/toy/Ch6/parser/AST.cpp +++ b/mlir/examples/toy/Ch6/parser/AST.cpp @@ -12,9 +12,12 @@ #include "toy/AST.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include using namespace toy; diff --git a/mlir/examples/toy/Ch6/toyc.cpp b/mlir/examples/toy/Ch6/toyc.cpp index 30522aa46107c..fe2137cfdfbfc 100644 --- a/mlir/examples/toy/Ch6/toyc.cpp +++ b/mlir/examples/toy/Ch6/toyc.cpp @@ -11,7 +11,11 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Func/Extensions/AllExtensions.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" #include "toy/Passes.h" @@ -26,7 +30,6 @@ #include "mlir/IR/Verifier.h" #include "mlir/InitAllDialects.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" @@ -34,6 +37,7 @@ #include "mlir/Transforms/Passes.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorOr.h" @@ -41,6 +45,11 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include using namespace toy; namespace cl = llvm::cl; diff --git a/mlir/examples/toy/Ch7/mlir/Dialect.cpp b/mlir/examples/toy/Ch7/mlir/Dialect.cpp index 8db5d735e1db0..f17173f007645 100644 --- a/mlir/examples/toy/Ch7/mlir/Dialect.cpp +++ b/mlir/examples/toy/Ch7/mlir/Dialect.cpp @@ -13,12 +13,32 @@ #include "toy/Dialect.h" +#include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/DialectImplementation.h" +#include "mlir/IR/Location.h" +#include "mlir/IR/MLIRContext.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/TypeSupport.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" 
#include "mlir/Transforms/InliningUtils.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include using namespace mlir; using namespace mlir::toy; diff --git a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp index fd589ddf84541..240b9f9338665 100644 --- a/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp +++ b/mlir/examples/toy/Ch7/mlir/LowerToAffineLoops.cpp @@ -12,7 +12,17 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinDialect.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/IR/ValueRange.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" @@ -22,7 +32,15 @@ #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include +#include +#include +#include +#include using namespace mlir; diff --git a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp index 67b82f91b1dbd..f91d880413c9c 100644 --- a/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp +++ b/mlir/examples/toy/Ch7/mlir/LowerToLLVM.cpp @@ -22,6 +22,14 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Dialect/LLVMIR/LLVMAttrs.h" +#include "mlir/Dialect/LLVMIR/LLVMTypes.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" @@ -34,7 +42,6 @@ #include "mlir/Conversion/LLVMCommon/TypeConverter.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" -#include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" @@ -42,7 +49,9 @@ #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" -#include "llvm/ADT/Sequence.h" +#include "llvm/Support/Casting.h" +#include +#include using namespace mlir; diff --git a/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp b/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp index 7f2a4c29ca6b5..0f8e8df38525f 100644 --- a/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp +++ b/mlir/examples/toy/Ch7/mlir/MLIRGen.cpp @@ -12,6 +12,10 @@ //===----------------------------------------------------------------------===// #include "toy/MLIRGen.h" +#include "mlir/IR/Block.h" +#include "mlir/IR/Diagnostics.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/AST.h" #include "toy/Dialect.h" @@ -21,12 +25,24 @@ #include "mlir/IR/BuiltinTypes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/IR/Verifier.h" +#include "toy/Lexer.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopedHashTable.h" -#include "llvm/Support/raw_ostream.h" +#include 
"llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include +#include #include #include +#include +#include +#include using namespace mlir::toy; using namespace toy; diff --git a/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp b/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp index d45baa14ab3e8..a9e995ed91bff 100644 --- a/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp +++ b/mlir/examples/toy/Ch7/mlir/ShapeInferencePass.cpp @@ -11,13 +11,21 @@ // //===----------------------------------------------------------------------===// +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Operation.h" +#include "mlir/IR/Types.h" #include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "mlir/Support/TypeID.h" #include "toy/Dialect.h" #include "toy/Passes.h" #include "toy/ShapeInferenceInterface.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include #define DEBUG_TYPE "shape-inference" diff --git a/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp b/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp index 09be97ea3c015..72f5e4b5c847b 100644 --- a/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp +++ b/mlir/examples/toy/Ch7/mlir/ToyCombine.cpp @@ -11,10 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "mlir/IR/Matchers.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OpDefinition.h" #include "mlir/IR/PatternMatch.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LogicalResult.h" #include "toy/Dialect.h" -#include +#include "llvm/Support/Casting.h" +#include using namespace mlir; using namespace toy; diff --git a/mlir/examples/toy/Ch7/parser/AST.cpp b/mlir/examples/toy/Ch7/parser/AST.cpp index 3542f8f9e11cf..e38a743a769c6 100644 --- a/mlir/examples/toy/Ch7/parser/AST.cpp +++ b/mlir/examples/toy/Ch7/parser/AST.cpp @@ -12,9 +12,12 @@ #include "toy/AST.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include using namespace toy; diff --git a/mlir/examples/toy/Ch7/toyc.cpp b/mlir/examples/toy/Ch7/toyc.cpp index a83292ecac5fe..d4cc8e7279d3b 100644 --- a/mlir/examples/toy/Ch7/toyc.cpp +++ b/mlir/examples/toy/Ch7/toyc.cpp @@ -11,7 +11,11 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/Func/Extensions/AllExtensions.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Support/LogicalResult.h" +#include "toy/AST.h" #include "toy/Dialect.h" +#include "toy/Lexer.h" #include "toy/MLIRGen.h" #include "toy/Parser.h" #include "toy/Passes.h" @@ -26,7 +30,6 @@ #include "mlir/IR/Verifier.h" #include "mlir/InitAllDialects.h" #include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" @@ -34,6 +37,7 @@ #include "mlir/Transforms/Passes.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorOr.h" @@ -41,6 +45,11 @@ #include "llvm/Support/SourceMgr.h" #include 
"llvm/Support/TargetSelect.h" #include "llvm/Support/raw_ostream.h" +#include +#include +#include +#include +#include using namespace toy; namespace cl = llvm::cl; diff --git a/mlir/examples/transform/Ch2/lib/MyExtension.cpp b/mlir/examples/transform/Ch2/lib/MyExtension.cpp index af2890d3b5239..031c52c307382 100644 --- a/mlir/examples/transform/Ch2/lib/MyExtension.cpp +++ b/mlir/examples/transform/Ch2/lib/MyExtension.cpp @@ -15,6 +15,14 @@ #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/IR/DialectRegistry.h" +#include "mlir/IR/Operation.h" +#include "mlir/Interfaces/SideEffectInterfaces.h" +#include "mlir/Support/LLVM.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" // Define a new transform dialect extension. This uses the CRTP idiom to // identify extensions. From f5063bf7edbb6368deb7354b4340eb62b8329d2e Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Thu, 26 Oct 2023 13:37:50 +0800 Subject: [PATCH 008/877] [clang-tidy][NFC]refactor PreferMemberInitializerCheck for readability --- .../PreferMemberInitializerCheck.cpp | 252 +++++++++--------- 1 file changed, 126 insertions(+), 126 deletions(-) diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp index aac824f15fa57..0ef13ae298033 100644 --- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp +++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp @@ -178,140 +178,140 @@ void PreferMemberInitializerCheck::check( const FieldDecl *Field = nullptr; const Expr *InitValue = nullptr; std::tie(Field, InitValue) = isAssignmentToMemberOf(Class, S, Ctor); - if (Field) { - if (IsUseDefaultMemberInitEnabled && getLangOpts().CPlusPlus11 && - Ctor->isDefaultConstructor() && - (getLangOpts().CPlusPlus20 || !Field->isBitField()) && - !Field->hasInClassInitializer() && - (!isa(Class->getDeclContext()) || - !cast(Class->getDeclContext())->isUnion()) && - shouldBeDefaultMemberInitializer(InitValue)) { - - bool InvalidFix = false; - SourceLocation FieldEnd = - Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0, - *Result.SourceManager, getLangOpts()); - InvalidFix |= FieldEnd.isInvalid() || FieldEnd.isMacroID(); - SourceLocation SemiColonEnd; - if (auto NextToken = Lexer::findNextToken( - S->getEndLoc(), *Result.SourceManager, getLangOpts())) - SemiColonEnd = NextToken->getEndLoc(); - else - InvalidFix = true; - auto Diag = - diag(S->getBeginLoc(), "%0 should be initialized in an in-class" - " default member initializer") - << Field; - if (InvalidFix) + if (!Field) + continue; + const bool IsInDefaultMemberInitializer = + IsUseDefaultMemberInitEnabled && getLangOpts().CPlusPlus11 && + Ctor->isDefaultConstructor() && + (getLangOpts().CPlusPlus20 || !Field->isBitField()) && + !Field->hasInClassInitializer() && + (!isa(Class->getDeclContext()) || + !cast(Class->getDeclContext())->isUnion()) && + shouldBeDefaultMemberInitializer(InitValue); + if (IsInDefaultMemberInitializer) { + bool InvalidFix = false; + SourceLocation FieldEnd = + Lexer::getLocForEndOfToken(Field->getSourceRange().getEnd(), 0, + *Result.SourceManager, getLangOpts()); + InvalidFix |= FieldEnd.isInvalid() || FieldEnd.isMacroID(); + SourceLocation SemiColonEnd; + if 
(auto NextToken = Lexer::findNextToken( + S->getEndLoc(), *Result.SourceManager, getLangOpts())) + SemiColonEnd = NextToken->getEndLoc(); + else + InvalidFix = true; + auto Diag = + diag(S->getBeginLoc(), "%0 should be initialized in an in-class" + " default member initializer") + << Field; + if (InvalidFix) + continue; + CharSourceRange StmtRange = + CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd); + + SmallString<128> Insertion( + {UseAssignment ? " = " : "{", + Lexer::getSourceText( + CharSourceRange(InitValue->getSourceRange(), true), + *Result.SourceManager, getLangOpts()), + UseAssignment ? "" : "}"}); + + Diag << FixItHint::CreateInsertion(FieldEnd, Insertion) + << FixItHint::CreateRemoval(StmtRange); + + } else { + StringRef InsertPrefix = ""; + bool HasInitAlready = false; + SourceLocation InsertPos; + SourceRange ReplaceRange; + bool AddComma = false; + bool InvalidFix = false; + unsigned Index = Field->getFieldIndex(); + const CXXCtorInitializer *LastInListInit = nullptr; + for (const CXXCtorInitializer *Init : Ctor->inits()) { + if (!Init->isWritten() || Init->isInClassMemberInitializer()) continue; - CharSourceRange StmtRange = - CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd); - - SmallString<128> Insertion( - {UseAssignment ? " = " : "{", - Lexer::getSourceText( - CharSourceRange(InitValue->getSourceRange(), true), - *Result.SourceManager, getLangOpts()), - UseAssignment ? "" : "}"}); - - Diag << FixItHint::CreateInsertion(FieldEnd, Insertion) - << FixItHint::CreateRemoval(StmtRange); - - } else { - StringRef InsertPrefix = ""; - bool HasInitAlready = false; - SourceLocation InsertPos; - SourceRange ReplaceRange; - bool AddComma = false; - bool InvalidFix = false; - unsigned Index = Field->getFieldIndex(); - const CXXCtorInitializer *LastInListInit = nullptr; - for (const CXXCtorInitializer *Init : Ctor->inits()) { - if (!Init->isWritten() || Init->isInClassMemberInitializer()) - continue; - if (Init->getMember() == Field) { - HasInitAlready = true; - if (isa(Init->getInit())) - InsertPos = Init->getRParenLoc(); - else { - ReplaceRange = Init->getInit()->getSourceRange(); - } - break; - } - if (Init->isMemberInitializer() && - Index < Init->getMember()->getFieldIndex()) { - InsertPos = Init->getSourceLocation(); - // There are initializers after the one we are inserting, so add a - // comma after this insertion in order to not break anything. - AddComma = true; - break; + if (Init->getMember() == Field) { + HasInitAlready = true; + if (isa(Init->getInit())) + InsertPos = Init->getRParenLoc(); + else { + ReplaceRange = Init->getInit()->getSourceRange(); } - LastInListInit = Init; + break; } - if (HasInitAlready) { - if (InsertPos.isValid()) - InvalidFix |= InsertPos.isMacroID(); - else - InvalidFix |= ReplaceRange.getBegin().isMacroID() || - ReplaceRange.getEnd().isMacroID(); - } else { - if (InsertPos.isInvalid()) { - if (LastInListInit) { - InsertPos = Lexer::getLocForEndOfToken( - LastInListInit->getRParenLoc(), 0, *Result.SourceManager, - getLangOpts()); - // Inserting after the last constructor initializer, so we need a - // comma. - InsertPrefix = ", "; - } else { - InsertPos = Lexer::getLocForEndOfToken( - Ctor->getTypeSourceInfo() - ->getTypeLoc() - .getAs() - .getLocalRangeEnd(), - 0, *Result.SourceManager, getLangOpts()); - - // If this is first time in the loop, there are no initializers so - // `:` declares member initialization list. If this is a - // subsequent pass then we have already inserted a `:` so continue - // with a comma. 
- InsertPrefix = FirstToCtorInits ? " : " : ", "; - } - } + if (Init->isMemberInitializer() && + Index < Init->getMember()->getFieldIndex()) { + InsertPos = Init->getSourceLocation(); + // There are initializers after the one we are inserting, so add a + // comma after this insertion in order to not break anything. + AddComma = true; + break; + } + LastInListInit = Init; + } + if (HasInitAlready) { + if (InsertPos.isValid()) InvalidFix |= InsertPos.isMacroID(); + else + InvalidFix |= ReplaceRange.getBegin().isMacroID() || + ReplaceRange.getEnd().isMacroID(); + } else { + if (InsertPos.isInvalid()) { + if (LastInListInit) { + InsertPos = Lexer::getLocForEndOfToken( + LastInListInit->getRParenLoc(), 0, *Result.SourceManager, + getLangOpts()); + // Inserting after the last constructor initializer, so we need a + // comma. + InsertPrefix = ", "; + } else { + InsertPos = Lexer::getLocForEndOfToken( + Ctor->getTypeSourceInfo() + ->getTypeLoc() + .getAs() + .getLocalRangeEnd(), + 0, *Result.SourceManager, getLangOpts()); + + // If this is first time in the loop, there are no initializers so + // `:` declares member initialization list. If this is a + // subsequent pass then we have already inserted a `:` so continue + // with a comma. + InsertPrefix = FirstToCtorInits ? " : " : ", "; + } } + InvalidFix |= InsertPos.isMacroID(); + } - SourceLocation SemiColonEnd; - if (auto NextToken = Lexer::findNextToken( - S->getEndLoc(), *Result.SourceManager, getLangOpts())) - SemiColonEnd = NextToken->getEndLoc(); + SourceLocation SemiColonEnd; + if (auto NextToken = Lexer::findNextToken( + S->getEndLoc(), *Result.SourceManager, getLangOpts())) + SemiColonEnd = NextToken->getEndLoc(); + else + InvalidFix = true; + + auto Diag = diag(S->getBeginLoc(), "%0 should be initialized in a member" + " initializer of the constructor") + << Field; + if (InvalidFix) + continue; + StringRef NewInit = Lexer::getSourceText( + CharSourceRange(InitValue->getSourceRange(), true), + *Result.SourceManager, getLangOpts()); + if (HasInitAlready) { + if (InsertPos.isValid()) + Diag << FixItHint::CreateInsertion(InsertPos, NewInit); else - InvalidFix = true; - - auto Diag = - diag(S->getBeginLoc(), "%0 should be initialized in a member" - " initializer of the constructor") - << Field; - if (InvalidFix) - continue; - StringRef NewInit = Lexer::getSourceText( - CharSourceRange(InitValue->getSourceRange(), true), - *Result.SourceManager, getLangOpts()); - if (HasInitAlready) { - if (InsertPos.isValid()) - Diag << FixItHint::CreateInsertion(InsertPos, NewInit); - else - Diag << FixItHint::CreateReplacement(ReplaceRange, NewInit); - } else { - SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", - NewInit, AddComma ? "), " : ")"}); - Diag << FixItHint::CreateInsertion(InsertPos, Insertion, - FirstToCtorInits); - FirstToCtorInits = areDiagsSelfContained(); - } - Diag << FixItHint::CreateRemoval( - CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd)); + Diag << FixItHint::CreateReplacement(ReplaceRange, NewInit); + } else { + SmallString<128> Insertion({InsertPrefix, Field->getName(), "(", + NewInit, AddComma ? 
"), " : ")"}); + Diag << FixItHint::CreateInsertion(InsertPos, Insertion, + FirstToCtorInits); + FirstToCtorInits = areDiagsSelfContained(); } + Diag << FixItHint::CreateRemoval( + CharSourceRange::getCharRange(S->getBeginLoc(), SemiColonEnd)); } } } From 926173c614784149889b2c975adccf52bcece75b Mon Sep 17 00:00:00 2001 From: KAWASHIMA Takahiro Date: Thu, 26 Oct 2023 14:51:20 +0900 Subject: [PATCH 009/877] [AArch64] Prevent argument promotion of vector with size > 128 bits (#70034) This patch prevents argument promotion from promoting pointers to fixed-length vector types larger than 128 bits like `<8 x float>` into the values of the pointees. Such vector types are used for SVE VLS but there is no ABI for SVE VLS arguments and the backend cannot lower such value arguments. Fixes #69147 --- .../AArch64/AArch64TargetTransformInfo.cpp | 24 +++ .../AArch64/AArch64TargetTransformInfo.h | 3 + llvm/test/CodeGen/AArch64/arg_promotion.ll | 190 ++++++++++++++++++ 3 files changed, 217 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/arg_promotion.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d8a0e68d71237..f121dc40e9fe6 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -212,6 +212,30 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } +bool AArch64TTIImpl::areTypesABICompatible( + const Function *Caller, const Function *Callee, + const ArrayRef &Types) const { + if (!BaseT::areTypesABICompatible(Caller, Callee, Types)) + return false; + + // We need to ensure that argument promotion does not attempt to promote + // pointers to fixed-length vector types larger than 128 bits like + // <8 x float> (and pointers to aggregate types which have such fixed-length + // vector type members) into the values of the pointees. Such vector types + // are used for SVE VLS but there is no ABI for SVE VLS arguments and the + // backend cannot lower such value arguments. The 128-bit fixed-length SVE + // types can be safely treated as 128-bit NEON types and they cannot be + // distinguished in IR. 
+  if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
+        auto FVTy = dyn_cast<FixedVectorType>(Ty);
+        return FVTy &&
+               FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
+      }))
+    return false;
+
+  return true;
+}
+
 bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
     TargetTransformInfo::RegisterKind K) const {
   assert(K != TargetTransformInfo::RGK_Scalar);
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 0501c42285232..c08004ad299fd 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -77,6 +77,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
   bool areInlineCompatible(const Function *Caller,
                            const Function *Callee) const;
 
+  bool areTypesABICompatible(const Function *Caller, const Function *Callee,
+                             const ArrayRef<Type *> &Types) const;
+
   /// \name Scalar TTI Implementations
   /// @{
 
diff --git a/llvm/test/CodeGen/AArch64/arg_promotion.ll b/llvm/test/CodeGen/AArch64/arg_promotion.ll
new file mode 100644
index 0000000000000..cc37d230c6cbe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arg_promotion.ll
@@ -0,0 +1,190 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+
+; RUN: opt -S -passes=argpromotion -mtriple=aarch64-unknwon-linux-gnu < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Don't promote a vector pointer argument when the pointee type size is greater
+; than 128 bits.
+
+define dso_local void @caller_8xi32(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define dso_local void @caller_8xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call fastcc void @callee_8xi32(ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_8xi32(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_8xi32(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define internal fastcc void @callee_8xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    store <8 x i32> [[TMP0]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i32>, ptr %src, align 16
+  store <8 x i32> %0, ptr %dst, align 16
+  ret void
+}
+
+; Promote a vector pointer argument when the pointee type size is 128 bits or
+; less.
+
+define dso_local void @caller_4xi32(ptr noalias %src, ptr noalias %dst) #1 {
+; CHECK-LABEL: define dso_local void @caller_4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    call fastcc void @callee_4xi32(<4 x i32> [[SRC_VAL]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_4xi32(ptr noalias %src, ptr noalias %dst) #1 {
+; CHECK-LABEL: define internal fastcc void @callee_4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i32>, ptr %src, align 16
+  store <4 x i32> %0, ptr %dst, align 16
+  ret void
+}
+
+; A scalar pointer argument is promoted even when the pointee type size is
+; greater than 128 bits.
+
+define dso_local void @caller_i256(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define dso_local void @caller_i256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_VAL:%.*]] = load i256, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    call fastcc void @callee_i256(i256 [[SRC_VAL]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_i256(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_i256(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define internal fastcc void @callee_i256(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i256 [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i256, ptr %src, align 16
+  store i256 %0, ptr %dst, align 16
+  ret void
+}
+
+; A scalable vector pointer argument is not a target of ArgumentPromotionPass.
+
+define dso_local void @caller_nx4xi32(ptr noalias %src, ptr noalias %dst) #2 {
+; CHECK-LABEL: define dso_local void @caller_nx4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call fastcc void @callee_nx4xi32(ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_nx4xi32(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_nx4xi32(ptr noalias %src, ptr noalias %dst) #2 {
+; CHECK-LABEL: define internal fastcc void @callee_nx4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP0]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 4 x i32>, ptr %src, align 16
+  store <vscale x 4 x i32> %0, ptr %dst, align 16
+  ret void
+}
+
+; Don't promote a structure pointer argument when the pointee vector member
+; type size is greater than 128 bits.
+
+%struct_8xi32 = type { <8 x i32>, <8 x i32> }
+
+define dso_local void @caller_struct8xi32(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define dso_local void @caller_struct8xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    call fastcc void @callee_struct8xi32(ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_struct8xi32(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_struct8xi32(ptr noalias %src, ptr noalias %dst) #0 {
+; CHECK-LABEL: define internal fastcc void @callee_struct8xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i32>, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    store <8 x i32> [[TMP0]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    [[SRC2:%.*]] = getelementptr inbounds [[STRUCT_8XI32:%.*]], ptr [[SRC]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, ptr [[SRC2]], align 16
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr inbounds [[STRUCT_8XI32]], ptr [[DST]], i64 0, i32 1
+; CHECK-NEXT:    store <8 x i32> [[TMP1]], ptr [[DST2]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i32>, ptr %src, align 16
+  store <8 x i32> %0, ptr %dst, align 16
+  %src2 = getelementptr inbounds %struct_8xi32, ptr %src, i64 0, i32 1
+  %1 = load <8 x i32>, ptr %src2, align 16
+  %dst2 = getelementptr inbounds %struct_8xi32, ptr %dst, i64 0, i32 1
+  store <8 x i32> %1, ptr %dst2, align 16
+  ret void
+}
+
+; Promote a structure pointer argument when the pointee vector member type size
+; is 128 bits or less.
+
+%struct_4xi32 = type { <4 x i32>, <4 x i32> }
+
+define dso_local void @caller_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 {
+; CHECK-LABEL: define dso_local void @caller_struct4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[SRC_VAL:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[SRC]], i64 16
+; CHECK-NEXT:    [[SRC_VAL1:%.*]] = load <4 x i32>, ptr [[TMP0]], align 16
+; CHECK-NEXT:    call fastcc void @callee_struct4xi32(<4 x i32> [[SRC_VAL]], <4 x i32> [[SRC_VAL1]], ptr noalias [[DST:%.*]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  call fastcc void @callee_struct4xi32(ptr noalias %src, ptr noalias %dst)
+  ret void
+}
+
+define internal fastcc void @callee_struct4xi32(ptr noalias %src, ptr noalias %dst) #1 {
+; CHECK-LABEL: define internal fastcc void @callee_struct4xi32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store <4 x i32> [[SRC_0_VAL:%.*]], ptr [[DST:%.*]], align 16
+; CHECK-NEXT:    [[DST2:%.*]] = getelementptr inbounds [[STRUCT_4XI32:%.*]], ptr [[DST]], i64 0, i32 1
+; CHECK-NEXT:    store <4 x i32> [[SRC_16_VAL:%.*]], ptr [[DST2]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i32>, ptr %src, align 16
+  store <4 x i32> %0, ptr %dst, align 16
+  %src2 = getelementptr inbounds %struct_4xi32, ptr %src, i64 0, i32 1
+  %1 = load <4 x i32>, ptr %src2, align 16
+  %dst2 = getelementptr inbounds %struct_4xi32, ptr %dst, i64 0, i32 1
+  store <4 x i32> %1, ptr %dst2, align 16
+  ret void
+}
+
+attributes #0 = { noinline vscale_range(2,2) "target-features"="+v8.2a,+neon,+sve" }
+attributes #1 = { noinline vscale_range(1,1) "target-features"="+v8.2a,+neon,+sve" }
+attributes #2 = { noinline "target-features"="+v8.2a,+neon,+sve" }

From a6d509fadbf7565baf336c2e25d1798fd40e59c9 Mon Sep 17 00:00:00 2001
From: Tobias Hieta
Date: Thu, 26 Oct 2023 08:31:33 +0200
Subject: [PATCH 010/877] [Support] Better error msg when cache dir can't be created. (#69575)

On Windows, if you pass /lldltocache:D:\tmp to lld and you don't have
D: mounted, it fails to create the cache dir D:\tmp, but the error
message is pretty hard to understand:

```
c:\code\llvm\llvm-project\out\debug>bin\lld-link.exe /lldltocache:D:\tmp hello.obj
LLVM ERROR: no such file or directory
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
Exception Code: 0xC000001D
```

which led one of our users to report this as a crash. I have added a
slightly better message, so it now says:

```
c:\code\llvm\llvm-project\out\debug>bin\lld-link.exe /lldltocache:D:\tmp hello.obj
LLVM ERROR: Can't create cache directory: D:\tmp
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace.
```

I am not sure this should be a fatal error, since it is not really
something that should be reported as a bug to LLVM, but at least this
gives a bit more visibility into what to change.
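For illustration, the error-reporting pattern the diff below adopts looks roughly like this as a standalone helper. This is a hand-written sketch, not the exact lld/llvm code: `createCacheDir` is a made-up name, while `sys::fs::create_directories` and `createStringError` are the llvm/Support APIs actually used in the change.

```
// Sketch: wrap the raw std::error_code in an Error whose message names
// the path that failed, so the fatal-error banner becomes actionable.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"

static llvm::Error createCacheDir(llvm::StringRef CacheDirectoryPath) {
  if (std::error_code EC = llvm::sys::fs::create_directories(
          CacheDirectoryPath, /*IgnoreExisting=*/true))
    // createStringError keeps EC's error category but replaces the bare
    // "no such file or directory" text with one that includes the path.
    return llvm::createStringError(
        EC, llvm::Twine("can't create cache directory ") +
                CacheDirectoryPath + ": " + EC.message());
  return llvm::Error::success();
}
```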
---
 lld/test/COFF/lto-cache-errors.ll | 20 ++++++++++++++++++++
 llvm/lib/Support/Caching.cpp      |  4 +++-
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 lld/test/COFF/lto-cache-errors.ll

diff --git a/lld/test/COFF/lto-cache-errors.ll b/lld/test/COFF/lto-cache-errors.ll
new file mode 100644
index 0000000000000..55244e5690dc3
--- /dev/null
+++ b/lld/test/COFF/lto-cache-errors.ll
@@ -0,0 +1,20 @@
+; REQUIRES: x86
+;; Not supported on windows since we use permissions to deny the creation
+; UNSUPPORTED: system-windows
+
+; RUN: opt -module-hash -module-summary %s -o %t.o
+; RUN: opt -module-hash -module-summary %p/Inputs/lto-cache.ll -o %t2.o
+; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: chmod 444 %t.cache
+
+;; Check emit warnings when we can't create the cache dir
+; RUN: not --crash lld-link /lldltocache:%t.cache/nonexistant/ /out:%t3 /entry:main %t2.o %t.o 2>&1 | FileCheck %s
+; CHECK: LLVM ERROR: can't create cache directory {{.*}}/nonexistant/: Permission denied
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+define void @globalfunc() #0 {
+entry:
+  ret void
+}
diff --git a/llvm/lib/Support/Caching.cpp b/llvm/lib/Support/Caching.cpp
index f20f08a865c76..628e23e1cb3d1 100644
--- a/llvm/lib/Support/Caching.cpp
+++ b/llvm/lib/Support/Caching.cpp
@@ -145,7 +145,9 @@ Expected<FileCache> llvm::localCache(const Twine &CacheNameRef,
   // ensures the filesystem isn't mutated until the cache is.
   if (std::error_code EC = sys::fs::create_directories(
           CacheDirectoryPath, /*IgnoreExisting=*/true))
-    return errorCodeToError(EC);
+    return createStringError(EC, Twine("can't create cache directory ") +
+                                     CacheDirectoryPath + ": " +
+                                     EC.message());
 
   // Write to a temporary to avoid race condition
   SmallString<64> TempFilenameModel;

From c285b7f5139d0870d5ccbdc3f73b254004211030 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 26 Oct 2023 07:59:25 +0100
Subject: [PATCH 011/877] [RISCV] Add tests for vmadd for VP intrinsics. NFC (#70042)

We have VP tests for vmacc but not vmadd. This copies the vmacc tests
but swaps the false operand of vp.merge to be the multiplicand instead
of the addend. This shows how we could fold the vmerge into the vmadd's
mask if we commuted %a and %b.
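Before the (very large, autogenerated) test diff below, here is one of the new cases written out by hand with the vector types spelled in full, since the pattern is easier to see at this size. This is an illustrative sketch, not text copied from the patch: `@vmadd_vv_sketch` is a made-up name, and the real tests build `%allones` with insertelement/shufflevector rather than taking it as an argument.

```
declare <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
declare <vscale x 1 x i8> @llvm.vp.add.nxv1i8(<vscale x 1 x i8>, <vscale x 1 x i8>, <vscale x 1 x i1>, i32)
declare <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1>, <vscale x 1 x i8>, <vscale x 1 x i8>, i32)

define <vscale x 1 x i8> @vmadd_vv_sketch(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i8> %c, <vscale x 1 x i1> %m, <vscale x 1 x i1> %allones, i32 zeroext %evl) {
  ; x = a * b, then y = x + c: together a multiply-add with %a as multiplicand.
  %x = call <vscale x 1 x i8> @llvm.vp.mul.nxv1i8(<vscale x 1 x i8> %a, <vscale x 1 x i8> %b, <vscale x 1 x i1> %allones, i32 %evl)
  %y = call <vscale x 1 x i8> @llvm.vp.add.nxv1i8(<vscale x 1 x i8> %x, <vscale x 1 x i8> %c, <vscale x 1 x i1> %allones, i32 %evl)
  ; The vmacc tests pass %c as the false operand here; the vmadd tests pass
  ; %a, so a backend that commuted %a and %b could fold this merge into the
  ; mask of a masked vmadd.vv instead of emitting a separate vmerge.
  %u = call <vscale x 1 x i8> @llvm.vp.merge.nxv1i8(<vscale x 1 x i1> %m, <vscale x 1 x i8> %y, <vscale x 1 x i8> %a, i32 %evl)
  ret <vscale x 1 x i8> %u
}
```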
--- llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll | 2427 +++++++++++++++++++++++ 1 file changed, 2427 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll new file mode 100644 index 0000000000000..6a6d7d2a41424 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vmadd-vp.ll @@ -0,0 +1,2427 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64 + +declare @llvm.vp.mul.nxv1i8(, , , i32) +declare @llvm.vp.add.nxv1i8(, , , i32) +declare @llvm.vp.merge.nxv1i8(, , , i32) +declare @llvm.vp.select.nxv1i8(, , , i32) + +define @vmadd_vv_nxv1i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: 
vmadd_vv_nxv1i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf8, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv2i8(, , , i32) +declare @llvm.vp.add.nxv2i8(, , , i32) +declare @llvm.vp.merge.nxv2i8(, , , i32) +declare @llvm.vp.select.nxv2i8(, , , i32) + +define @vmadd_vv_nxv2i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement 
poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf4, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf4, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv4i8(, , , i32) +declare @llvm.vp.add.nxv4i8(, , , i32) +declare @llvm.vp.merge.nxv4i8(, , , i32) +declare @llvm.vp.select.nxv4i8(, , , i32) + +define @vmadd_vv_nxv4i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i8( %m, %y, %a, i32 %evl) + ret %u +} + +define 
@vmadd_vx_nxv4i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv8i8(, , , i32) +declare @llvm.vp.add.nxv8i8(, , , i32) +declare @llvm.vp.merge.nxv8i8(, , , i32) +declare @llvm.vp.select.nxv8i8(, , , i32) + +define @vmadd_vv_nxv8i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e8, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = 
shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv16i8(, , , i32) +declare @llvm.vp.add.nxv16i8(, , , i32) +declare @llvm.vp.merge.nxv16i8(, , , i32) +declare @llvm.vp.select.nxv16i8(, , , i32) + +define @vmadd_vv_nxv16i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, 
i32 %evl) + %u = call @llvm.vp.merge.nxv16i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv32i8(, , , i32) +declare @llvm.vp.add.nxv32i8(, , , i32) +declare @llvm.vp.merge.nxv32i8(, , , i32) +declare @llvm.vp.select.nxv32i8(, , , i32) + +define @vmadd_vv_nxv32i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv32i8_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i8_unmasked: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e8, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv32i8( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv32i8_unmasked( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i8_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i8( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv32i8_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e8, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv32i8( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv32i8_ta( %a, i8 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i8_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma +; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i8 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i8( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i8( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv32i8( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv64i8(, , , i32) +declare @llvm.vp.add.nxv64i8(, , , i32) +declare @llvm.vp.merge.nxv64i8(, , , i32) +declare @llvm.vp.select.nxv64i8(, , , i32) + +define @vmadd_vv_nxv64i8( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv64i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8r.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e8, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = 
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.merge.nxv64i8(<vscale x 64 x i1> %m, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+define <vscale x 64 x i8> @vmadd_vv_nxv64i8_unmasked(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, <vscale x 64 x i8> %c, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl8r.v v24, (a0)
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmacc.vv v24, v8, v16
+; CHECK-NEXT:    vsetvli zero, zero, e8, m8, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v24
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 64 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.merge.nxv64i8(<vscale x 64 x i1> %allones, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+define <vscale x 64 x i8> @vmadd_vx_nxv64i8(<vscale x 64 x i8> %a, i8 %b, <vscale x 64 x i8> %c, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, mu
+; CHECK-NEXT:    vmadd.vx v8, a0, v16, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %splat = insertelement <vscale x 64 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.merge.nxv64i8(<vscale x 64 x i1> %m, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+define <vscale x 64 x i8> @vmadd_vx_nxv64i8_unmasked(<vscale x 64 x i8> %a, i8 %b, <vscale x 64 x i8> %c, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv64i8_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, tu, ma
+; CHECK-NEXT:    vmadd.vx v8, a0, v16
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %splat = insertelement <vscale x 64 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.merge.nxv64i8(<vscale x 64 x i1> %allones, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+define <vscale x 64 x i8> @vmadd_vv_nxv64i8_ta(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, <vscale x 64 x i8> %c, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv64i8_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vl8r.v v24, (a0)
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmacc.vv v24, v8, v16
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 64 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %b, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.select.nxv64i8(<vscale x 64 x i1> %m, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+define <vscale x 64 x i8> @vmadd_vx_nxv64i8_ta(<vscale x 64 x i8> %a, i8 %b, <vscale x 64 x i8> %c, <vscale x 64 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv64i8_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
+; CHECK-NEXT:    vmacc.vx v16, a0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 64 x i8> poison, i8 %b, i32 0
+  %vb = shufflevector <vscale x 64 x i8> %elt.head, <vscale x 64 x i8> poison, <vscale x 64 x i32> zeroinitializer
+  %splat = insertelement <vscale x 64 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 64 x i1> %splat, <vscale x 64 x i1> poison, <vscale x 64 x i32> zeroinitializer
+  %x = call <vscale x 64 x i8> @llvm.vp.mul.nxv64i8(<vscale x 64 x i8> %a, <vscale x 64 x i8> %vb, <vscale x 64 x i1> %allones, i32 %evl)
+  %y = call <vscale x 64 x i8> @llvm.vp.add.nxv64i8(<vscale x 64 x i8> %x, <vscale x 64 x i8> %c, <vscale x 64 x i1> %allones, i32 %evl)
+  %u = call <vscale x 64 x i8> @llvm.vp.select.nxv64i8(<vscale x 64 x i1> %m, <vscale x 64 x i8> %y, <vscale x 64 x i8> %a, i32 %evl)
+  ret <vscale x 64 x i8> %u
+}
+
+declare <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16>, <vscale x 1 x i16>, <vscale x 1 x i1>, i32)
+declare <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1>, <vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+declare <vscale x 1 x i16> @llvm.vp.select.nxv1i16(<vscale x 1 x i1>, <vscale x 1 x i16>, <vscale x 1 x i16>, i32)
+
+define <vscale x 1 x i16> @vmadd_vv_nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> %m, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+define <vscale x 1 x i16> @vmadd_vv_nxv1i16_unmasked(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> %allones, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+define <vscale x 1 x i16> @vmadd_vx_nxv1i16(<vscale x 1 x i16> %a, i16 %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv1i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, tu, mu
+; CHECK-NEXT:    vmadd.vx v8, a0, v9, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> %m, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+define <vscale x 1 x i16> @vmadd_vx_nxv1i16_unmasked(<vscale x 1 x i16> %a, i16 %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv1i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, tu, ma
+; CHECK-NEXT:    vmadd.vx v8, a0, v9
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.merge.nxv1i16(<vscale x 1 x i1> %allones, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+define <vscale x 1 x i16> @vmadd_vv_nxv1i16_ta(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv1i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf4, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %b, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.select.nxv1i16(<vscale x 1 x i1> %m, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+define <vscale x 1 x i16> @vmadd_vx_nxv1i16_ta(<vscale x 1 x i16> %a, i16 %b, <vscale x 1 x i16> %c, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv1i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NEXT:    vmacc.vx v9, a0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 1 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 1 x i16> %elt.head, <vscale x 1 x i16> poison, <vscale x 1 x i32> zeroinitializer
+  %splat = insertelement <vscale x 1 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 1 x i1> %splat, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+  %x = call <vscale x 1 x i16> @llvm.vp.mul.nxv1i16(<vscale x 1 x i16> %a, <vscale x 1 x i16> %vb, <vscale x 1 x i1> %allones, i32 %evl)
+  %y = call <vscale x 1 x i16> @llvm.vp.add.nxv1i16(<vscale x 1 x i16> %x, <vscale x 1 x i16> %c, <vscale x 1 x i1> %allones, i32 %evl)
+  %u = call <vscale x 1 x i16> @llvm.vp.select.nxv1i16(<vscale x 1 x i1> %m, <vscale x 1 x i16> %y, <vscale x 1 x i16> %a, i32 %evl)
+  ret <vscale x 1 x i16> %u
+}
+
+declare <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16>, <vscale x 2 x i1>, i32)
+declare <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1>, <vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+declare <vscale x 2 x i16> @llvm.vp.select.nxv2i16(<vscale x 2 x i1>, <vscale x 2 x i16>, <vscale x 2 x i16>, i32)
+
+define <vscale x 2 x i16> @vmadd_vv_nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> %m, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+define <vscale x 2 x i16> @vmadd_vv_nxv2i16_unmasked(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> %allones, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+define <vscale x 2 x i16> @vmadd_vx_nxv2i16(<vscale x 2 x i16> %a, i16 %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, tu, mu
+; CHECK-NEXT:    vmadd.vx v8, a0, v9, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> %m, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+define <vscale x 2 x i16> @vmadd_vx_nxv2i16_unmasked(<vscale x 2 x i16> %a, i16 %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv2i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, tu, ma
+; CHECK-NEXT:    vmadd.vx v8, a0, v9
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.merge.nxv2i16(<vscale x 2 x i1> %allones, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+define <vscale x 2 x i16> @vmadd_vv_nxv2i16_ta(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv2i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, mf2, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %b, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.select.nxv2i16(<vscale x 2 x i1> %m, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+define <vscale x 2 x i16> @vmadd_vx_nxv2i16_ta(<vscale x 2 x i16> %a, i16 %b, <vscale x 2 x i16> %c, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv2i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NEXT:    vmacc.vx v9, a0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 2 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 2 x i16> %elt.head, <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
+  %splat = insertelement <vscale x 2 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 2 x i1> %splat, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+  %x = call <vscale x 2 x i16> @llvm.vp.mul.nxv2i16(<vscale x 2 x i16> %a, <vscale x 2 x i16> %vb, <vscale x 2 x i1> %allones, i32 %evl)
+  %y = call <vscale x 2 x i16> @llvm.vp.add.nxv2i16(<vscale x 2 x i16> %x, <vscale x 2 x i16> %c, <vscale x 2 x i1> %allones, i32 %evl)
+  %u = call <vscale x 2 x i16> @llvm.vp.select.nxv2i16(<vscale x 2 x i1> %m, <vscale x 2 x i16> %y, <vscale x 2 x i16> %a, i32 %evl)
+  ret <vscale x 2 x i16> %u
+}
+
+declare <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1>, <vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+declare <vscale x 4 x i16> @llvm.vp.select.nxv4i16(<vscale x 4 x i1>, <vscale x 4 x i16>, <vscale x 4 x i16>, i32)
+
+define <vscale x 4 x i16> @vmadd_vv_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> %m, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+define <vscale x 4 x i16> @vmadd_vv_nxv4i16_unmasked(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vsetvli zero, zero, e16, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> %allones, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+define <vscale x 4 x i16> @vmadd_vx_nxv4i16(<vscale x 4 x i16> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv4i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, mu
+; CHECK-NEXT:    vmadd.vx v8, a0, v9, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> %m, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+define <vscale x 4 x i16> @vmadd_vx_nxv4i16_unmasked(<vscale x 4 x i16> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv4i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
+; CHECK-NEXT:    vmadd.vx v8, a0, v9
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.merge.nxv4i16(<vscale x 4 x i1> %allones, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+define <vscale x 4 x i16> @vmadd_vv_nxv4i16_ta(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv4i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vmadd.vv v9, v8, v10
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.select.nxv4i16(<vscale x 4 x i1> %m, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+define <vscale x 4 x i16> @vmadd_vx_nxv4i16_ta(<vscale x 4 x i16> %a, i16 %b, <vscale x 4 x i16> %c, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv4i16_ta:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vmacc.vx v9, a0, v8
+; CHECK-NEXT:    vmerge.vvm v8, v8, v9, v0
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 4 x i16> %elt.head, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+  %splat = insertelement <vscale x 4 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 4 x i1> %splat, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+  %x = call <vscale x 4 x i16> @llvm.vp.mul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %vb, <vscale x 4 x i1> %allones, i32 %evl)
+  %y = call <vscale x 4 x i16> @llvm.vp.add.nxv4i16(<vscale x 4 x i16> %x, <vscale x 4 x i16> %c, <vscale x 4 x i1> %allones, i32 %evl)
+  %u = call <vscale x 4 x i16> @llvm.vp.select.nxv4i16(<vscale x 4 x i1> %m, <vscale x 4 x i16> %y, <vscale x 4 x i16> %a, i32 %evl)
+  ret <vscale x 4 x i16> %u
+}
+
+declare <vscale x 8 x i16> @llvm.vp.mul.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+declare <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i1>, i32)
+declare <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.vp.select.nxv8i16(<vscale x 8 x i1>, <vscale x 8 x i16>, <vscale x 8 x i16>, i32)
+
+define <vscale x 8 x i16> @vmadd_vv_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vmadd.vv v10, v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
+; CHECK-NEXT:    vmerge.vvm v8, v8, v10, v0
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %x = call <vscale x 8 x i16> @llvm.vp.mul.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i1> %allones, i32 %evl)
+  %y = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %c, <vscale x 8 x i1> %allones, i32 %evl)
+  %u = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> %m, <vscale x 8 x i16> %y, <vscale x 8 x i16> %a, i32 %evl)
+  ret <vscale x 8 x i16> %u
+}
+
+define <vscale x 8 x i16> @vmadd_vv_nxv8i16_unmasked(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vv_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e16, m2, ta, ma
+; CHECK-NEXT:    vmadd.vv v10, v8, v12
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %x = call <vscale x 8 x i16> @llvm.vp.mul.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i1> %allones, i32 %evl)
+  %y = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %c, <vscale x 8 x i1> %allones, i32 %evl)
+  %u = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> %allones, <vscale x 8 x i16> %y, <vscale x 8 x i16> %a, i32 %evl)
+  ret <vscale x 8 x i16> %u
+}
+
+define <vscale x 8 x i16> @vmadd_vx_nxv8i16(<vscale x 8 x i16> %a, i16 %b, <vscale x 8 x i16> %c, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, tu, mu
+; CHECK-NEXT:    vmadd.vx v8, a0, v10, v0.t
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16> %elt.head, <vscale x 8 x i16> poison, <vscale x 8 x i32> zeroinitializer
+  %splat = insertelement <vscale x 8 x i1> poison, i1 -1, i32 0
+  %allones = shufflevector <vscale x 8 x i1> %splat, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+  %x = call <vscale x 8 x i16> @llvm.vp.mul.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %vb, <vscale x 8 x i1> %allones, i32 %evl)
+  %y = call <vscale x 8 x i16> @llvm.vp.add.nxv8i16(<vscale x 8 x i16> %x, <vscale x 8 x i16> %c, <vscale x 8 x i1> %allones, i32 %evl)
+  %u = call <vscale x 8 x i16> @llvm.vp.merge.nxv8i16(<vscale x 8 x i1> %m, <vscale x 8 x i16> %y, <vscale x 8 x i16> %a, i32 %evl)
+  ret <vscale x 8 x i16> %u
+}
+
+define <vscale x 8 x i16> @vmadd_vx_nxv8i16_unmasked(<vscale x 8 x i16> %a, i16 %b, <vscale x 8 x i16> %c, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vmadd_vx_nxv8i16_unmasked:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a1, e16, m2, tu, ma
+; CHECK-NEXT:    vmadd.vx v8, a0, v10
+; CHECK-NEXT:    ret
+  %elt.head = insertelement <vscale x 8 x i16> poison, i16 %b, i32 0
+  %vb = shufflevector <vscale x 8 x i16>
%elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i16( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i16_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m2, ta, ma +; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i16( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv16i16(, , , i32) +declare @llvm.vp.add.nxv16i16(, , , i32) +declare @llvm.vp.merge.nxv16i16(, , , i32) +declare @llvm.vp.select.nxv16i16(, , , i32) + +define @vmadd_vv_nxv16i16( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i16_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i16( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i16( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %vb, %allones, i32 %evl) + %y = call 
@llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i16_unmasked( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i16( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i16_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i16( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv32i16(, , , i32) +declare @llvm.vp.add.nxv32i16(, , , i32) +declare @llvm.vp.merge.nxv32i16(, , , i32) +declare @llvm.vp.select.nxv32i16(, , , i32) + +define @vmadd_vv_nxv32i16( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re16.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv32i16_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re16.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i16( %allones, %y, %a, i32 %evl) + ret %u +} + +define 
@vmadd_vx_nxv32i16( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v16, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv32i16_unmasked( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i16_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv32i16( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv32i16_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv32i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re16.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv32i16( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv32i16_ta( %a, i16 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv32i16_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vmacc.vx v16, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i16 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv32i16( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv32i16( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv32i16( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv1i32(, , , i32) +declare @llvm.vp.add.nxv1i32(, , , i32) +declare @llvm.vp.merge.nxv1i32(, , , i32) +declare @llvm.vp.select.nxv1i32(, , , i32) + +define @vmadd_vv_nxv1i32( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i32_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, 
mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i32( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i32_unmasked( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i32_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv1i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, mf2, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i32( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv2i32(, , , i32) +declare @llvm.vp.add.nxv2i32(, , , i32) +declare @llvm.vp.merge.nxv2i32(, , , i32) +declare @llvm.vp.select.nxv2i32(, , , i32) + +define @vmadd_vv_nxv2i32( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = 
shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i32_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i32( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v9, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i32_unmasked( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v9 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i32_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv2i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma +; CHECK-NEXT: vmacc.vx v9, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i32( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv4i32(, , , i32) +declare @llvm.vp.add.nxv4i32(, , , i32) +declare @llvm.vp.merge.nxv4i32(, , , i32) +declare 
@llvm.vp.select.nxv4i32(, , , i32) + +define @vmadd_vv_nxv4i32( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i32_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i32( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v10, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i32_unmasked( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v10 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i32_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv4i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; CHECK-NEXT: vmacc.vx v10, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement 
poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i32( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv8i32(, , , i32) +declare @llvm.vp.add.nxv8i32(, , , i32) +declare @llvm.vp.merge.nxv8i32(, , , i32) +declare @llvm.vp.select.nxv8i32(, , , i32) + +define @vmadd_vv_nxv8i32( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i32_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i32( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v12, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i32_unmasked( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v12 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i32_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i32( %m, %y, %a, i32 %evl) + ret %u 
+} + +define @vmadd_vx_nxv8i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv8i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m4, ta, ma +; CHECK-NEXT: vmacc.vx v12, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i32( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv16i32(, , , i32) +declare @llvm.vp.add.nxv16i32(, , , i32) +declare @llvm.vp.merge.nxv16i32(, , , i32) +declare @llvm.vp.select.nxv16i32(, , , i32) + +define @vmadd_vv_nxv16i32( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i32_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e32, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i32( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, mu +; CHECK-NEXT: vmadd.vx v8, a0, v16, v0.t +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i32_unmasked( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, tu, ma +; CHECK-NEXT: vmadd.vx v8, a0, v16 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv16i32( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv16i32_ta( %a, %b, %c, %m, i32 zeroext 
%evl) { +; CHECK-LABEL: vmadd_vv_nxv16i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re32.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i32( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv16i32_ta( %a, i32 %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vx_nxv16i32_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vmacc.vx v16, a0, v8 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: ret + %elt.head = insertelement poison, i32 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv16i32( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv16i32( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv16i32( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv1i64(, , , i32) +declare @llvm.vp.add.nxv1i64(, , , i32) +declare @llvm.vp.merge.nxv1i64(, , , i32) +declare @llvm.vp.select.nxv1i64(, , , i32) + +define @vmadd_vv_nxv1i64( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i64_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i64( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv1i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmadd.vv v10, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv1i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, mu +; RV64-NEXT: vmadd.vx v8, a0, v9, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = 
shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i64_unmasked( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv1i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmadd.vv v10, v8, v9 +; RV32-NEXT: vsetvli zero, zero, e64, m1, tu, ma +; RV32-NEXT: vmv.v.v v8, v10 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv1i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, tu, ma +; RV64-NEXT: vmadd.vx v8, a0, v9 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv1i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv1i64_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv1i64_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmadd.vv v9, v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv1i64_ta( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv1i64_ta: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m1, ta, ma +; RV32-NEXT: vlse64.v v10, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m1, ta, ma +; RV32-NEXT: vmadd.vv v10, v8, v9 +; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv1i64_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m1, ta, ma +; RV64-NEXT: vmacc.vx v9, a0, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v9, v0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv1i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv1i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv1i64( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv2i64(, , , i32) +declare @llvm.vp.add.nxv2i64(, , , i32) +declare @llvm.vp.merge.nxv2i64(, , , i32) +declare @llvm.vp.select.nxv2i64(, , , i32) + +define @vmadd_vv_nxv2i64( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; 
CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i64_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i64( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv2i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmadd.vv v12, v8, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v12, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv2i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, mu +; RV64-NEXT: vmadd.vx v8, a0, v10, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i64_unmasked( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv2i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmadd.vv v12, v8, v10 +; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma +; RV32-NEXT: vmv.v.v v8, v12 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv2i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, tu, ma +; RV64-NEXT: vmadd.vx v8, a0, v10 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv2i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv2i64_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv2i64_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmadd.vv v10, v8, v12 +; CHECK-NEXT: vmerge.vvm v8, v8, v10, v0 
+; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv2i64_ta( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv2i64_ta: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; RV32-NEXT: vlse64.v v12, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m2, ta, ma +; RV32-NEXT: vmadd.vv v12, v8, v10 +; RV32-NEXT: vmerge.vvm v8, v8, v12, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv2i64_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m2, ta, ma +; RV64-NEXT: vmacc.vx v10, a0, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv2i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv2i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv2i64( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv4i64(, , , i32) +declare @llvm.vp.add.nxv4i64(, , , i32) +declare @llvm.vp.merge.nxv4i64(, , , i32) +declare @llvm.vp.select.nxv4i64(, , , i32) + +define @vmadd_vv_nxv4i64( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i64_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i64( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv4i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmadd.vv v16, v8, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv4i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, 
tu, mu +; RV64-NEXT: vmadd.vx v8, a0, v12, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i64_unmasked( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv4i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmadd.vv v16, v8, v12 +; RV32-NEXT: vsetvli zero, zero, e64, m4, tu, ma +; RV32-NEXT: vmv.v.v v8, v16 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv4i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, tu, ma +; RV64-NEXT: vmadd.vx v8, a0, v12 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv4i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv4i64_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv4i64_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmadd.vv v12, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v12, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv4i64_ta( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv4i64_ta: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m4, ta, ma +; RV32-NEXT: vlse64.v v16, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m4, ta, ma +; RV32-NEXT: vmadd.vv v16, v8, v12 +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv4i64_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; RV64-NEXT: vmacc.vx v12, a0, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v12, v0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv4i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv4i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv4i64( %m, %y, %a, i32 %evl) + ret %u +} + +declare @llvm.vp.mul.nxv8i64(, , , i32) +declare @llvm.vp.add.nxv8i64(, , , i32) +declare @llvm.vp.merge.nxv8i64(, , , i32) +declare @llvm.vp.select.nxv8i64(, , , i32) + +define @vmadd_vv_nxv8i64( 
%a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, tu, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i64_unmasked( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vsetvli zero, zero, e64, m8, tu, ma +; CHECK-NEXT: vmv.v.v v8, v24 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i64( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv8i64: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vmadd.vv v24, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv8i64: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, tu, mu +; RV64-NEXT: vmadd.vx v8, a0, v16, v0.t +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.merge.nxv8i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i64_unmasked( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv8i64_unmasked: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vmadd.vv v24, v8, v16 +; RV32-NEXT: vsetvli zero, zero, e64, m8, tu, ma +; RV32-NEXT: vmv.v.v v8, v24 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv8i64_unmasked: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, tu, ma +; RV64-NEXT: vmadd.vx v8, a0, v16 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call 
@llvm.vp.merge.nxv8i64( %allones, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vv_nxv8i64_ta( %a, %b, %c, %m, i32 zeroext %evl) { +; CHECK-LABEL: vmadd_vv_nxv8i64_ta: +; CHECK: # %bb.0: +; CHECK-NEXT: vl8re64.v v24, (a0) +; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; CHECK-NEXT: vmacc.vv v24, v8, v16 +; CHECK-NEXT: vmerge.vvm v8, v8, v24, v0 +; CHECK-NEXT: ret + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %b, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i64( %m, %y, %a, i32 %evl) + ret %u +} + +define @vmadd_vx_nxv8i64_ta( %a, i64 %b, %c, %m, i32 zeroext %evl) { +; RV32-LABEL: vmadd_vx_nxv8i64_ta: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: sw a1, 12(sp) +; RV32-NEXT: sw a0, 8(sp) +; RV32-NEXT: addi a0, sp, 8 +; RV32-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV32-NEXT: vlse64.v v24, (a0), zero +; RV32-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; RV32-NEXT: vmadd.vv v24, v8, v16 +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: vmadd_vx_nxv8i64_ta: +; RV64: # %bb.0: +; RV64-NEXT: vsetvli zero, a1, e64, m8, ta, ma +; RV64-NEXT: vmacc.vx v16, a0, v8 +; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: ret + %elt.head = insertelement poison, i64 %b, i32 0 + %vb = shufflevector %elt.head, poison, zeroinitializer + %splat = insertelement poison, i1 -1, i32 0 + %allones = shufflevector %splat, poison, zeroinitializer + %x = call @llvm.vp.mul.nxv8i64( %a, %vb, %allones, i32 %evl) + %y = call @llvm.vp.add.nxv8i64( %x, %c, %allones, i32 %evl) + %u = call @llvm.vp.select.nxv8i64( %m, %y, %a, i32 %evl) + ret %u +} From 897cc8a7d7c0e47686322dbd4b95ecad30bb2298 Mon Sep 17 00:00:00 2001 From: Shivam Gupta Date: Thu, 26 Oct 2023 12:39:48 +0530 Subject: [PATCH 012/877] [RecursiveASTVisitor] Fix RecursiveASTVisitor (RAV) fails to visit the initializer of a bitfield (#69557) The problem was introduced in the commit https://github.com/llvm/llvm-project/commit/6b8e3c02ca44fb6c3738bb0c75859c11a03e30ed when the possibility of initialized bitfields was added, but the logic in RecursiveASTVisitor was not updated. This PR fixed that. This fixes https://github.com/llvm/llvm-project/issues/64916. Patch by Scott McPeak --------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 3 ++ clang/include/clang/AST/RecursiveASTVisitor.h | 2 +- clang/unittests/Tooling/CMakeLists.txt | 1 + .../BitfieldInitializer.cpp | 34 +++++++++++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 clang/unittests/Tooling/RecursiveASTVisitorTests/BitfieldInitializer.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 42f20b9a9bb04..074116d2edf9f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -650,6 +650,9 @@ Bug Fixes to AST Handling `Issue 64170 `_ - Fixed ``hasAnyBase`` not binding nodes in its submatcher. (`#65421 `_) +- Fixed a bug where RecursiveASTVisitor fails to visit the + initializer of a bitfield. 
+  `Issue 64916 <https://github.com/llvm/llvm-project/issues/64916>`_

 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^

diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h
index 3dd23eb38eeab..53bc15e1b19f6 100644
--- a/clang/include/clang/AST/RecursiveASTVisitor.h
+++ b/clang/include/clang/AST/RecursiveASTVisitor.h
@@ -2103,7 +2103,7 @@ DEF_TRAVERSE_DECL(FieldDecl, {
   TRY_TO(TraverseDeclaratorHelper(D));
   if (D->isBitField())
     TRY_TO(TraverseStmt(D->getBitWidth()));
-  else if (D->hasInClassInitializer())
+  if (D->hasInClassInitializer())
     TRY_TO(TraverseStmt(D->getInClassInitializer()));
 })

diff --git a/clang/unittests/Tooling/CMakeLists.txt b/clang/unittests/Tooling/CMakeLists.txt
index 2fbe78e3fab75..5a10a6b285390 100644
--- a/clang/unittests/Tooling/CMakeLists.txt
+++ b/clang/unittests/Tooling/CMakeLists.txt
@@ -25,6 +25,7 @@ add_clang_unittest(ToolingTests
   QualTypeNamesTest.cpp
   RangeSelectorTest.cpp
   RecursiveASTVisitorTests/Attr.cpp
+  RecursiveASTVisitorTests/BitfieldInitializer.cpp
   RecursiveASTVisitorTests/CallbacksLeaf.cpp
   RecursiveASTVisitorTests/CallbacksUnaryOperator.cpp
   RecursiveASTVisitorTests/CallbacksBinaryOperator.cpp
diff --git a/clang/unittests/Tooling/RecursiveASTVisitorTests/BitfieldInitializer.cpp b/clang/unittests/Tooling/RecursiveASTVisitorTests/BitfieldInitializer.cpp
new file mode 100644
index 0000000000000..c11e726fe8552
--- /dev/null
+++ b/clang/unittests/Tooling/RecursiveASTVisitorTests/BitfieldInitializer.cpp
@@ -0,0 +1,34 @@
+//===- unittest/Tooling/RecursiveASTVisitorTests/BitfieldInitializer.cpp -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "TestVisitor.h"
+#include <string>
+
+using namespace clang;
+
+namespace {
+
+// Check to ensure that bitfield initializers are visited.
+class BitfieldInitializerVisitor
+    : public ExpectedLocationVisitor<BitfieldInitializerVisitor> {
+public:
+  bool VisitIntegerLiteral(IntegerLiteral *IL) {
+    Match(std::to_string(IL->getValue().getSExtValue()), IL->getLocation());
+    return true;
+  }
+};
+
+TEST(RecursiveASTVisitor, BitfieldInitializerIsVisited) {
+  BitfieldInitializerVisitor Visitor;
+  Visitor.ExpectMatch("123", 2, 15);
+  EXPECT_TRUE(Visitor.runOver("struct S {\n"
+                              "  int x : 8 = 123;\n"
+                              "};\n"));
+}
+
+} // end anonymous namespace

From 6e2d67e7d66f46fbe4f4c35c7c59d3a8706ea8b0 Mon Sep 17 00:00:00 2001
From: Yeting Kuo <46629943+yetingk@users.noreply.github.com>
Date: Thu, 26 Oct 2023 15:10:57 +0800
Subject: [PATCH 013/877] [RISCV] Support predefined macro
 __riscv_misaligned_[fast,avoid]. (#65756)

The RISC-V C API introduced predefined macros to convey hints about
unaligned accesses ([pr]). This patch defines __riscv_misaligned_fast
when using -mno-strict-align and defines __riscv_misaligned_avoid
otherwise.

Note: This ignores __riscv_misaligned_slow, which is also defined by
the spec.

[pr]: https://github.com/riscv-non-isa/riscv-c-api-doc/pull/40
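For illustration only (this snippet is not part of the patch; the function and the copy strategies are hypothetical, only the macro names come from the change), code can dispatch on the new hints like so:
```c
#include <string.h>

/* Hypothetical example: pick a copy strategy based on the hint macros. */
static void copy_bytes(void *dst, const void *src, unsigned n) {
#if defined(__riscv_misaligned_fast)
  /* Misaligned scalar accesses are cheap; a plain memcpy is fine. */
  memcpy(dst, src, n);
#elif defined(__riscv_misaligned_avoid)
  /* Misaligned accesses may trap or be emulated; copy byte by byte. */
  unsigned char *d = dst;
  const unsigned char *s = src;
  while (n--)
    *d++ = *s++;
#endif
}
```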
---
 clang/lib/Basic/Targets/RISCV.cpp               |  7 +++++++
 clang/lib/Basic/Targets/RISCV.h                 |  3 +++
 clang/test/Preprocessor/riscv-target-features.c | 12 ++++++++++++
 3 files changed, 22 insertions(+)

diff --git a/clang/lib/Basic/Targets/RISCV.cpp b/clang/lib/Basic/Targets/RISCV.cpp
index 5f75619b74554..0b9ebeaf5e75b 100644
--- a/clang/lib/Basic/Targets/RISCV.cpp
+++ b/clang/lib/Basic/Targets/RISCV.cpp
@@ -210,6 +210,11 @@ void RISCVTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (VScale && VScale->first && VScale->first == VScale->second)
     Builder.defineMacro("__riscv_v_fixed_vlen",
                         Twine(VScale->first * llvm::RISCV::RVVBitsPerBlock));
+
+  if (FastUnalignedAccess)
+    Builder.defineMacro("__riscv_misaligned_fast");
+  else
+    Builder.defineMacro("__riscv_misaligned_avoid");
 }

 static constexpr Builtin::Info BuiltinInfo[] = {
@@ -328,6 +333,8 @@ bool RISCVTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   if (ISAInfo->hasExtension("zfh") || ISAInfo->hasExtension("zhinx"))
     HasLegalHalfType = true;

+  FastUnalignedAccess = llvm::is_contained(Features, "+unaligned-scalar-mem");
+
   return true;
 }

diff --git a/clang/lib/Basic/Targets/RISCV.h b/clang/lib/Basic/Targets/RISCV.h
index 6be0e49ca2f55..e5424d318401f 100644
--- a/clang/lib/Basic/Targets/RISCV.h
+++ b/clang/lib/Basic/Targets/RISCV.h
@@ -29,6 +29,9 @@ class RISCVTargetInfo : public TargetInfo {
   std::string ABI, CPU;
   std::unique_ptr<llvm::RISCVISAInfo> ISAInfo;

+private:
+  bool FastUnalignedAccess;
+
 public:
   RISCVTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
       : TargetInfo(Triple) {
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index ffdec34ca615f..3a0435f9c9790 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -1246,3 +1246,15 @@
 // RUN:   -march=rv64i_zve32x_zvkt1p0 -x c -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-ZVKT-EXT %s
 // CHECK-ZVKT-EXT: __riscv_zvkt 1000000{{$}}
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -x c -E -dM %s \
+// RUN:   -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-AVOID
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -x c -E -dM %s \
+// RUN:   -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-AVOID
+// CHECK-MISALIGNED-AVOID: __riscv_misaligned_avoid 1
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -E -dM %s \
+// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \
+// RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
+// CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1

From 3b59b3ef449d8296ed5ac09631ca383e139ec700 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Thu, 26 Oct 2023 07:18:32 +0000
Subject: [PATCH 014/877] [gn build] Port 897cc8a7d7c0

---
 llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
index e784b39cd0f18..43f8b886f9b97 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Tooling/BUILD.gn
@@ -44,6 +44,7 @@ unittest("ToolingTests") {
     "RecursiveASTVisitorTestPostOrderVisitor.cpp",
     "RecursiveASTVisitorTestTypeLocVisitor.cpp",
     "RecursiveASTVisitorTests/Attr.cpp",
+    "RecursiveASTVisitorTests/BitfieldInitializer.cpp",
"RecursiveASTVisitorTests/BitfieldInitializer.cpp", "RecursiveASTVisitorTests/CXXBoolLiteralExpr.cpp", "RecursiveASTVisitorTests/CXXMemberCall.cpp", "RecursiveASTVisitorTests/CXXMethodDecl.cpp", From 46028022407d003b8af4ddf27a4679de4891f10d Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 26 Oct 2023 00:26:54 -0700 Subject: [PATCH 015/877] [AMDGPU] Shrink to SOPK with 32-bit signed literals (#70263) A literal like 0xffff8000 is valid to be used as KIMM in a SOPK instruction, but at the moment our checks expect it to be fully sign extended to a 64-bit signed integer. This is not required since all cases which are being shrunk only accept 32-bit operands. We need to sign extend the operand to 64-bit though so it passes the verifier and properly printed. --- .../Target/AMDGPU/SIShrinkInstructions.cpp | 15 +++-- llvm/test/CodeGen/AMDGPU/shrink-i32-kimm.mir | 57 +++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/shrink-i32-kimm.mir diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 88c75a0f86a6c..856121be78031 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -159,7 +159,7 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const { } bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const { - return isInt<16>(Src.getImm()) && + return isInt<16>(SignExtend64(Src.getImm(), 32)) && !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo()); } @@ -170,7 +170,7 @@ bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const { bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const { - if (isInt<16>(Src.getImm())) { + if (isInt<16>(SignExtend64(Src.getImm(), 32))) { IsUnsigned = false; return !TII->isInlineConstant(Src); } @@ -221,7 +221,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!Src0.isReg()) return; - const MachineOperand &Src1 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(1); if (!Src1.isImm()) return; @@ -237,6 +237,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if (!HasUImm) { SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ? AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32; + Src1.setImm(SignExtend32(Src1.getImm(), 32)); } MI.setDesc(TII->get(SOPKOpc)); @@ -249,6 +250,8 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const { if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) || (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) { + if (!TII->sopkIsZext(SOPKOpc)) + Src1.setImm(SignExtend64(Src1.getImm(), 32)); MI.setDesc(NewDesc); } } @@ -838,6 +841,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? 
            AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+          Src1->setImm(SignExtend64(Src1->getImm(), 32));
           MI.setDesc(TII->get(Opc));
           MI.tieOperands(0, 1);
         }
@@ -857,9 +861,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
         if (Src.isImm() && Dst.getReg().isPhysical()) {
           int32_t ReverseImm;
-          if (isKImmOperand(Src))
+          if (isKImmOperand(Src)) {
             MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
-          else if (isReverseInlineImm(Src, ReverseImm)) {
+            Src.setImm(SignExtend64(Src.getImm(), 32));
+          } else if (isReverseInlineImm(Src, ReverseImm)) {
             MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
             Src.setImm(ReverseImm);
           }
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-i32-kimm.mir b/llvm/test/CodeGen/AMDGPU/shrink-i32-kimm.mir
new file mode 100644
index 0000000000000..e2198faf13f71
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/shrink-i32-kimm.mir
@@ -0,0 +1,57 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-shrink-instructions -o - %s | FileCheck -check-prefix=GCN %s
+
+---
+name: shrink_kimm32_mov_b32
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_kimm32_mov_b32
+    ; GCN: $sgpr0 = S_MOVK_I32 -2048
+    $sgpr0 = S_MOV_B32 4294965248
+...
+
+---
+name: shrink_kimm32_cmp_eq_u32
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_kimm32_cmp_eq_u32
+    ; GCN: S_CMPK_EQ_I32 undef $sgpr0, -2048, implicit-def $scc
+    S_CMP_EQ_U32 undef $sgpr0, 4294965248, implicit-def $scc
+...
+
+---
+name: shrink_kimm32_cmp_gt_i32
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_kimm32_cmp_gt_i32
+    ; GCN: S_CMPK_GT_I32 undef $sgpr0, -2048, implicit-def $scc
+    S_CMP_GT_I32 undef $sgpr0, 4294965248, implicit-def $scc
+...
+
+---
+name: shrink_kimm32_add_i32
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_kimm32_add_i32
+    ; GCN: $sgpr0 = S_ADDK_I32 undef $sgpr0, -2048, implicit-def $scc
+    $sgpr0 = S_ADD_I32 undef $sgpr0, 4294965248, implicit-def $scc
+...
+
+---
+name: shrink_kimm32_mul_i32
+tracksRegLiveness: true
+body: |
+  bb.0:
+
+    ; GCN-LABEL: name: shrink_kimm32_mul_i32
+    ; GCN: $sgpr0 = S_MULK_I32 undef $sgpr0, -2048, implicit-def $scc
+    $sgpr0 = S_MUL_I32 undef $sgpr0, 4294965248, implicit-def $scc
+...

From d1556e5efbf0cb671c0f6e403fc1eaf9153f8713 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Thu, 26 Oct 2023 08:33:30 +0100
Subject: [PATCH 016/877] [lldb][lldb-server] Enable sending RegisterFlags as
 XML (#69951)

This adds ToXML methods to encode RegisterFlags and its fields into XML
according to GDB's target XML format:
https://sourceware.org/gdb/onlinedocs/gdb/Target-Description-Format.html#Target-Description-Format

lldb-server does not use libXML to build XML, so this follows the
existing code that uses strings. Indentation is used so the result is
still human readable.
```
<flags id="Foo" size="4">
  <field name="abc" start="0" end="0"/>
</flags>
```
This is used by lldb-server when building target XML, though no one
sets any fields yet. That'll come in a later commit.
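As a sketch of how the pieces fit together (the register name and field layout below are invented; only the constructors and ToXML come from this patch), a consumer could build and serialize a set of flags like this:
```cpp
#include "lldb/Target/RegisterFlags.h"
#include "lldb/Utility/StreamString.h"

using namespace lldb_private;

// Hypothetical 4-byte status register with a single-bit field and a
// multi-bit field. ToXML emits the <flags>/<field> elements shown above.
static std::string DumpExampleFlags() {
  RegisterFlags flags("example_flags", /*size=*/4,
                      {RegisterFlags::Field("N", 31),      // bit 31 only
                       RegisterFlags::Field("mode", 0, 3)}); // bits 0-3
  StreamString strm;
  flags.ToXML(strm);
  return strm.GetString().str();
}
```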
--- lldb/include/lldb/Target/RegisterFlags.h | 19 +++++-- .../GDBRemoteCommunicationServerLLGS.cpp | 9 ++++ lldb/source/Target/RegisterFlags.cpp | 46 +++++++++++++++++ lldb/unittests/Target/RegisterFlagsTest.cpp | 50 +++++++++++++++++++ 4 files changed, 119 insertions(+), 5 deletions(-) diff --git a/lldb/include/lldb/Target/RegisterFlags.h b/lldb/include/lldb/Target/RegisterFlags.h index d98bc0263e35e..7c5b97c2265fd 100644 --- a/lldb/include/lldb/Target/RegisterFlags.h +++ b/lldb/include/lldb/Target/RegisterFlags.h @@ -9,20 +9,21 @@ #ifndef LLDB_TARGET_REGISTERFLAGS_H #define LLDB_TARGET_REGISTERFLAGS_H -#include "lldb/Utility/Log.h" +#include +#include namespace lldb_private { +class StreamString; +class Log; + class RegisterFlags { public: class Field { public: /// Where start is the least significant bit and end is the most /// significant bit. The start bit must be <= the end bit. - Field(std::string name, unsigned start, unsigned end) - : m_name(std::move(name)), m_start(start), m_end(end) { - assert(m_start <= m_end && "Start bit must be <= end bit."); - } + Field(std::string name, unsigned start, unsigned end); /// Construct a field that occupies a single bit. Field(std::string name, unsigned bit_position) @@ -51,6 +52,11 @@ class RegisterFlags { /// covered by either field. unsigned PaddingDistance(const Field &other) const; + /// Output XML that describes this field, to be inserted into a target XML + /// file. Reserved characters in field names like "<" are replaced with + /// their XML safe equivalents like ">". + void ToXML(StreamString &strm) const; + bool operator<(const Field &rhs) const { return GetStart() < rhs.GetStart(); } @@ -106,6 +112,9 @@ class RegisterFlags { /// be split into many tables as needed. std::string AsTable(uint32_t max_width) const; + // Output XML that describes this set of flags. 
+ void ToXML(StreamString &strm) const; + private: const std::string m_id; /// Size in bytes diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp index 23c2f18cd388a..187c23a206094 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp @@ -3094,6 +3094,12 @@ GDBRemoteCommunicationServerLLGS::BuildTargetXml() { continue; } + if (reg_info->flags_type) { + response.IndentMore(); + reg_info->flags_type->ToXML(response); + response.IndentLess(); + } + response.Indent(); response.Printf("flags_type) + response << "type=\"" << reg_info->flags_type->GetID() << "\" "; + const char *const register_set_name = reg_context.GetRegisterSetNameForRegisterAtIndex(reg_index); if (register_set_name) diff --git a/lldb/source/Target/RegisterFlags.cpp b/lldb/source/Target/RegisterFlags.cpp index 06fb45d777ec3..49974718ccb51 100644 --- a/lldb/source/Target/RegisterFlags.cpp +++ b/lldb/source/Target/RegisterFlags.cpp @@ -7,13 +7,21 @@ //===----------------------------------------------------------------------===// #include "lldb/Target/RegisterFlags.h" +#include "lldb/Utility/Log.h" #include "lldb/Utility/StreamString.h" +#include "llvm/ADT/StringExtras.h" + #include #include using namespace lldb_private; +RegisterFlags::Field::Field(std::string name, unsigned start, unsigned end) + : m_name(std::move(name)), m_start(start), m_end(end) { + assert(m_start <= m_end && "Start bit must be <= end bit."); +} + void RegisterFlags::Field::log(Log *log) const { LLDB_LOG(log, " Name: \"{0}\" Start: {1} End: {2}", m_name.c_str(), m_start, m_end); @@ -175,3 +183,41 @@ std::string RegisterFlags::AsTable(uint32_t max_width) const { return table; } + +void RegisterFlags::ToXML(StreamString &strm) const { + // Example XML: + // + // + // + strm.Indent(); + strm << ""; + for (const Field &field : m_fields) { + // Skip padding fields. + if (field.GetName().empty()) + continue; + + strm << "\n"; + strm.IndentMore(); + field.ToXML(strm); + strm.IndentLess(); + } + strm.PutChar('\n'); + strm.Indent("\n"); +} + +void RegisterFlags::Field::ToXML(StreamString &strm) const { + // Example XML: + // + strm.Indent(); + strm << ""; +} diff --git a/lldb/unittests/Target/RegisterFlagsTest.cpp b/lldb/unittests/Target/RegisterFlagsTest.cpp index 167e28d0cecb3..c7a4192031655 100644 --- a/lldb/unittests/Target/RegisterFlagsTest.cpp +++ b/lldb/unittests/Target/RegisterFlagsTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "lldb/Target/RegisterFlags.h" +#include "lldb/Utility/StreamString.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -258,3 +259,52 @@ TEST(RegisterFlagsTest, AsTable) { "| really long name |", max_many_columns.AsTable(23)); } + +TEST(RegisterFieldsTest, ToXML) { + StreamString strm; + + // RegisterFlags requires that some fields be given, so no testing of empty + // input. + + // Unnamed fields are padding that are ignored. This applies to fields passed + // in, and those generated to fill the other bits (31-1 here). 
+ RegisterFlags("Foo", 4, {RegisterFlags::Field("", 0, 0)}).ToXML(strm); + ASSERT_EQ(strm.GetString(), "\n" + "\n"); + + strm.Clear(); + RegisterFlags("Foo", 4, {RegisterFlags::Field("abc", 0, 0)}).ToXML(strm); + ASSERT_EQ(strm.GetString(), "\n" + " \n" + "\n"); + + strm.Clear(); + // Should use the current indentation level as a starting point. + strm.IndentMore(); + RegisterFlags( + "Bar", 5, + {RegisterFlags::Field("f1", 25, 32), RegisterFlags::Field("f2", 10, 24)}) + .ToXML(strm); + ASSERT_EQ(strm.GetString(), + " \n" + " \n" + " \n" + " \n"); + + strm.Clear(); + strm.IndentLess(); + // Should replace any XML unsafe characters in field names. + RegisterFlags("Safe", 8, + {RegisterFlags::Field("A<", 4), RegisterFlags::Field("B>", 3), + RegisterFlags::Field("C'", 2), RegisterFlags::Field("D\"", 1), + RegisterFlags::Field("E&", 0)}) + .ToXML(strm); + ASSERT_EQ(strm.GetString(), + "\n" + " \n" + " \n" + " \n" + " \n" + " \n" + "\n"); +} From 18775a49416cf767b34eaaf925df5143e40582f4 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Thu, 26 Oct 2023 08:42:25 +0100 Subject: [PATCH 017/877] [AArch64][SVE2] Use rshrnb for masked stores (#70026) This patch is a follow up on https://reviews.llvm.org/D155299. This patch combines add+lsr to rshrnb when 'B' in: C = A + B D = C >> Shift is equal to (1 << (Shift-1), and the bits in the top half of each vector element are zeroed or ignored, such as in a truncating masked store. --- .../Target/AArch64/AArch64ISelLowering.cpp | 33 +++++++++++++++---- .../AArch64/sve2-intrinsics-combine-rshrnb.ll | 19 +++++++++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7211607fee528..038c23b5e8d50 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -21002,6 +21002,12 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG, Store->getMemOperand()); } +bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) { + return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) || + (SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) || + (SrcVT == MVT::nxv2i64 && DstVT == MVT::nxv2i32); +} + static SDValue performSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG, @@ -21043,16 +21049,16 @@ static SDValue performSTORECombine(SDNode *N, if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST)) return Store; - if (ST->isTruncatingStore()) + if (ST->isTruncatingStore()) { + EVT StoreVT = ST->getMemoryVT(); + if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT)) + return SDValue(); if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) { - EVT StoreVT = ST->getMemoryVT(); - if ((ValueVT == MVT::nxv8i16 && StoreVT == MVT::nxv8i8) || - (ValueVT == MVT::nxv4i32 && StoreVT == MVT::nxv4i16) || - (ValueVT == MVT::nxv2i64 && StoreVT == MVT::nxv2i32)) - return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(), - StoreVT, ST->getMemOperand()); + return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(), + StoreVT, ST->getMemOperand()); } + } return SDValue(); } @@ -21098,6 +21104,19 @@ static SDValue performMSTORECombine(SDNode *N, } } + if (MST->isTruncatingStore()) { + EVT ValueVT = Value->getValueType(0); + EVT MemVT = MST->getMemoryVT(); + if (!isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) + return SDValue(); + if (SDValue Rshrnb = trySimplifySrlAddToRshrnb(Value, DAG, Subtarget)) { 
+ return DAG.getMaskedStore(MST->getChain(), DL, Rshrnb, MST->getBasePtr(), + MST->getOffset(), MST->getMask(), + MST->getMemoryVT(), MST->getMemOperand(), + MST->getAddressingMode(), true); + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll index a913177623df9..0afd11d098a00 100644 --- a/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll +++ b/llvm/test/CodeGen/AArch64/sve2-intrinsics-combine-rshrnb.ll @@ -298,3 +298,22 @@ define void @neg_add_lshr_rshrnb_s(ptr %ptr, ptr %dst, i64 %index){ store %3, ptr %4, align 1 ret void } + +define void @masked_store_rshrnb(ptr %ptr, ptr %dst, i64 %index, %mask) { ; preds = %vector.body, %vector.ph +; CHECK-LABEL: masked_store_rshrnb: +; CHECK: // %bb.0: +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: rshrnb z0.b, z0.h, #6 +; CHECK-NEXT: st1b { z0.h }, p0, [x1, x2] +; CHECK-NEXT: ret + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %ptr, i32 2, %mask, poison) + %1 = add %wide.masked.load, trunc ( shufflevector ( insertelement ( poison, i32 32, i64 0), poison, zeroinitializer) to ) + %2 = lshr %1, trunc ( shufflevector ( insertelement ( poison, i32 6, i64 0), poison, zeroinitializer) to ) + %3 = trunc %2 to + %4 = getelementptr inbounds i8, ptr %dst, i64 %index + tail call void @llvm.masked.store.nxv8i8.p0( %3, ptr %4, i32 1, %mask) + ret void +} + +declare void @llvm.masked.store.nxv8i8.p0(, ptr, i32, ) +declare @llvm.masked.load.nxv8i16.p0(ptr, i32, , ) From 05dcfa44c0f4c68c3a7b831d6ac2dd0572741ece Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 26 Oct 2023 10:55:53 +0300 Subject: [PATCH 018/877] [clang] [Gnu] Improve GCCVersion parsing to match versions such as "10-win32" (#69079) In earlier GCC versions, the Debian/Ubuntu provided mingw toolchains were packaged in /usr/lib/gcc/ with version strings such as "5.3-win32", which were matched and found since 6afcd64eb65fca233a7b173f88cffb2c2c9c114c. However in recent versions, they have stopped including the minor version number and only have version strings such as "10-win32" and "10-posix". Generalize the parsing code to tolerate the patch suffix to be present on a version number with only a major number. Refactor the string parsing code to highlight the overall structure of the parsing. This implementation should yield the same result as before, except for when there's only one segment and it has trailing, non-number contents. This allows Clang to find the GCC libraries and headers in Debian/Ubuntu provided MinGW cross compilers. 
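To make the accepted shapes concrete, here are a few parses in the style of the GCCVersionTest table that follows, as {Text, Major, Minor, Patch, MajorStr, MinorStr, PatchSuffix}. The first two rows are hypothetical additions written from the new parsing rules; the last mirrors the entry the patch adds:
```
{"5", 5, -1, -1, "5", "", ""},                     // major only, no suffix
{"10-posix", 10, -1, -1, "10", "", "-posix"},      // suffix on a lone major
{"10-win32", 10, -1, -1, "10", "", "-win32"},
```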
--- clang/lib/Driver/ToolChains/Gnu.cpp | 83 +++++++++++++++-------- clang/unittests/Driver/GCCVersionTest.cpp | 1 + 2 files changed, 56 insertions(+), 28 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index cdd911af9a733..a98dfa02fc717 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -2007,45 +2007,72 @@ Generic_GCC::GCCVersion Generic_GCC::GCCVersion::Parse(StringRef VersionText) { std::pair First = VersionText.split('.'); std::pair Second = First.second.split('.'); - GCCVersion GoodVersion = {VersionText.str(), -1, -1, -1, "", "", ""}; - if (First.first.getAsInteger(10, GoodVersion.Major) || GoodVersion.Major < 0) - return BadVersion; - GoodVersion.MajorStr = First.first.str(); - if (First.second.empty()) - return GoodVersion; + StringRef MajorStr = First.first; StringRef MinorStr = Second.first; - if (Second.second.empty()) { - if (size_t EndNumber = MinorStr.find_first_not_of("0123456789")) { - GoodVersion.PatchSuffix = std::string(MinorStr.substr(EndNumber)); - MinorStr = MinorStr.slice(0, EndNumber); - } - } - if (MinorStr.getAsInteger(10, GoodVersion.Minor) || GoodVersion.Minor < 0) - return BadVersion; - GoodVersion.MinorStr = MinorStr.str(); + StringRef PatchStr = Second.second; - // First look for a number prefix and parse that if present. Otherwise just - // stash the entire patch string in the suffix, and leave the number - // unspecified. This covers versions strings such as: - // 5 (handled above) + GCCVersion GoodVersion = {VersionText.str(), -1, -1, -1, "", "", ""}; + + // Parse version number strings such as: + // 5 // 4.4 // 4.4-patched // 4.4.0 // 4.4.x // 4.4.2-rc4 // 4.4.x-patched - // And retains any patch number it finds. - StringRef PatchText = Second.second; - if (!PatchText.empty()) { - if (size_t EndNumber = PatchText.find_first_not_of("0123456789")) { - // Try to parse the number and any suffix. - if (PatchText.slice(0, EndNumber).getAsInteger(10, GoodVersion.Patch) || - GoodVersion.Patch < 0) - return BadVersion; - GoodVersion.PatchSuffix = std::string(PatchText.substr(EndNumber)); + // 10-win32 + // Split on '.', handle 1, 2 or 3 such segments. Each segment must contain + // purely a number, except for the last one, where a non-number suffix + // is stored in PatchSuffix. The third segment is allowed to not contain + // a number at all. + + auto TryParseLastNumber = [&](StringRef Segment, int &Number, + std::string &OutStr) -> bool { + // Look for a number prefix and parse that, and split out any trailing + // string into GoodVersion.PatchSuffix. 
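+    // For example, a Segment of "2-rc4" yields Number = 2, OutStr = "2"
+    // and a PatchSuffix of "-rc4"; "10-win32" yields 10 and "-win32".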
+ + if (size_t EndNumber = Segment.find_first_not_of("0123456789")) { + StringRef NumberStr = Segment.slice(0, EndNumber); + if (NumberStr.getAsInteger(10, Number) || Number < 0) + return false; + OutStr = NumberStr; + GoodVersion.PatchSuffix = Segment.substr(EndNumber); + return true; } + return false; + }; + auto TryParseNumber = [](StringRef Segment, int &Number) -> bool { + if (Segment.getAsInteger(10, Number) || Number < 0) + return false; + return true; + }; + + if (MinorStr.empty()) { + // If no minor string, major is the last segment + if (!TryParseLastNumber(MajorStr, GoodVersion.Major, GoodVersion.MajorStr)) + return BadVersion; + return GoodVersion; } + if (!TryParseNumber(MajorStr, GoodVersion.Major)) + return BadVersion; + GoodVersion.MajorStr = MajorStr; + + if (PatchStr.empty()) { + // If no patch string, minor is the last segment + if (!TryParseLastNumber(MinorStr, GoodVersion.Minor, GoodVersion.MinorStr)) + return BadVersion; + return GoodVersion; + } + + if (!TryParseNumber(MinorStr, GoodVersion.Minor)) + return BadVersion; + GoodVersion.MinorStr = MinorStr; + + // For the last segment, tolerate a missing number. + std::string DummyStr; + TryParseLastNumber(PatchStr, GoodVersion.Patch, DummyStr); return GoodVersion; } diff --git a/clang/unittests/Driver/GCCVersionTest.cpp b/clang/unittests/Driver/GCCVersionTest.cpp index 88c26dfe814e3..3158911fe5db9 100644 --- a/clang/unittests/Driver/GCCVersionTest.cpp +++ b/clang/unittests/Driver/GCCVersionTest.cpp @@ -39,6 +39,7 @@ const VersionParseTest TestCases[] = { {"4.4.2-rc4", 4, 4, 2, "4", "4", "-rc4"}, {"4.4.x-patched", 4, 4, -1, "4", "4", ""}, {"not-a-version", -1, -1, -1, "", "", ""}, + {"10-win32", 10, -1, -1, "10", "", "-win32"}, }; TEST(GCCVersionTest, Parse) { From de7c0068329d78027df7b7184d72646c1ca9f2bd Mon Sep 17 00:00:00 2001 From: Qiu Chaofan Date: Thu, 26 Oct 2023 15:56:32 +0800 Subject: [PATCH 019/877] [PowerPC] Fix use of FPSCR builtins in smmintrin.h (#67299) smmintrin.h uses __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn. This patch replaces the uses with ppc prefix and implement the missing ones. 
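For orientation, a minimal use of the ppc-prefixed spellings (the function and variable names here are illustrative only; the builtins themselves are the ones this patch wires up):
```c
/* Read the FPSCR, then set the rounding-mode (RN) bits to round-to-nearest. */
double read_fpscr_then_round_to_nearest(void) {
  double fpscr = __builtin_ppc_mffs(); /* FPSCR contents as a double */
  __builtin_ppc_set_fpscr_rn(0);       /* RN = 0b00: round to nearest */
  return fpscr;
}
```
With the compatibility macros added to PPC.cpp, the unprefixed __builtin_mffs, __builtin_mffsl, __builtin_mtfsf and __builtin_set_fpscr_rn names continue to work and expand to these ppc-prefixed builtins.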
--- clang/include/clang/Basic/BuiltinsPPC.def | 2 + clang/lib/Basic/Targets/PPC.cpp | 4 ++ clang/lib/CodeGen/CGBuiltin.cpp | 5 ++ clang/lib/Headers/ppc_wrappers/smmintrin.h | 50 +++++++++++++------ clang/test/CodeGen/PowerPC/builtins-ppc.c | 13 ++++- clang/test/CodeGen/PowerPC/ppc-emmintrin.c | 5 ++ clang/test/CodeGen/PowerPC/ppc-mmintrin.c | 5 ++ clang/test/CodeGen/PowerPC/ppc-pmmintrin.c | 3 ++ clang/test/CodeGen/PowerPC/ppc-smmintrin.c | 37 ++++++++------ clang/test/CodeGen/PowerPC/ppc-tmmintrin.c | 3 ++ clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c | 3 ++ 11 files changed, 99 insertions(+), 31 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 18a1186053481..a35488ed3dfa5 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -151,9 +151,11 @@ TARGET_BUILTIN(__builtin_ppc_extract_exp, "Uid", "", "power9-vector") TARGET_BUILTIN(__builtin_ppc_extract_sig, "ULLid", "", "power9-vector") BUILTIN(__builtin_ppc_mtfsb0, "vUIi", "") BUILTIN(__builtin_ppc_mtfsb1, "vUIi", "") +BUILTIN(__builtin_ppc_mffs, "d", "") TARGET_BUILTIN(__builtin_ppc_mffsl, "d", "", "isa-v30-instructions") BUILTIN(__builtin_ppc_mtfsf, "vUIiUi", "") BUILTIN(__builtin_ppc_mtfsfi, "vUIiUIi", "") +BUILTIN(__builtin_ppc_set_fpscr_rn, "di", "") TARGET_BUILTIN(__builtin_ppc_insert_exp, "ddULLi", "", "power9-vector") BUILTIN(__builtin_ppc_fmsub, "dddd", "") BUILTIN(__builtin_ppc_fmsubs, "ffff", "") diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 0d87a3a4e8c20..5ce276e1af9ef 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -264,6 +264,10 @@ static void defineXLCompatMacros(MacroBuilder &Builder) { Builder.defineMacro("__builtin_minfe", "__builtin_ppc_minfe"); Builder.defineMacro("__builtin_minfl", "__builtin_ppc_minfl"); Builder.defineMacro("__builtin_minfs", "__builtin_ppc_minfs"); + Builder.defineMacro("__builtin_mffs", "__builtin_ppc_mffs"); + Builder.defineMacro("__builtin_mffsl", "__builtin_ppc_mffsl"); + Builder.defineMacro("__builtin_mtfsf", "__builtin_ppc_mtfsf"); + Builder.defineMacro("__builtin_set_fpscr_rn", "__builtin_ppc_set_fpscr_rn"); } /// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 85be8bdd00516..dce5ee5888c45 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -17397,6 +17397,11 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, Value *Op1 = EmitScalarExpr(E->getArg(1)); return Builder.CreateFDiv(Op0, Op1, "swdiv"); } + case PPC::BI__builtin_ppc_set_fpscr_rn: + return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_setrnd), + {EmitScalarExpr(E->getArg(0))}); + case PPC::BI__builtin_ppc_mffs: + return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_readflm)); } } diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h index 349b395c4f00b..19cdecb18d2b8 100644 --- a/clang/lib/Headers/ppc_wrappers/smmintrin.h +++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h @@ -14,7 +14,7 @@ #ifndef NO_WARN_X86_INTRINSICS /* This header is distributed to simplify porting x86_64 code that - makes explicit use of Intel intrinsics to powerp64/powerpc64le. + makes explicit use of Intel intrinsics to powerpc64/powerpc64le. It is the user's responsibility to determine if the results are acceptable and make additional changes as necessary. 
@@ -68,10 +68,10 @@ extern __inline __m128d __asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; #else - __fpscr_save.__fr = __builtin_mffs(); + __fpscr_save.__fr = __builtin_ppc_mffs(); __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; __fpscr_save.__fpscr &= ~0xf8; - __builtin_mtfsf(0b00000011, __fpscr_save.__fr); + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); #endif /* Insert an artificial "read/write" reference to the variable read below, to ensure the compiler does not schedule @@ -83,10 +83,15 @@ extern __inline __m128d switch (__rounding) { case _MM_FROUND_TO_NEAREST_INT: - __fpscr_save.__fr = __builtin_mffsl(); +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif __attribute__((fallthrough)); case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: - __builtin_set_fpscr_rn(0b00); + __builtin_ppc_set_fpscr_rn(0b00); /* Insert an artificial "read/write" reference to the variable read below, to ensure the compiler does not schedule a read/use of the variable before the FPSCR is modified, above. @@ -102,7 +107,7 @@ extern __inline __m128d This can be removed if and when GCC PR102783 is fixed. */ __asm__("" : : "wa"(__r)); - __builtin_set_fpscr_rn(__fpscr_save.__fpscr); + __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr); break; case _MM_FROUND_TO_NEG_INF: case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: @@ -128,9 +133,14 @@ extern __inline __m128d */ __asm__("" : : "wa"(__r)); /* Restore enabled exceptions. */ - __fpscr_save.__fr = __builtin_mffsl(); +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif __fpscr_save.__fpscr |= __enables_save.__fpscr; - __builtin_mtfsf(0b00000011, __fpscr_save.__fr); + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); } return (__m128d)__r; } @@ -159,10 +169,10 @@ extern __inline __m128 __asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; #else - __fpscr_save.__fr = __builtin_mffs(); + __fpscr_save.__fr = __builtin_ppc_mffs(); __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; __fpscr_save.__fpscr &= ~0xf8; - __builtin_mtfsf(0b00000011, __fpscr_save.__fr); + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); #endif /* Insert an artificial "read/write" reference to the variable read below, to ensure the compiler does not schedule @@ -174,10 +184,15 @@ extern __inline __m128 switch (__rounding) { case _MM_FROUND_TO_NEAREST_INT: - __fpscr_save.__fr = __builtin_mffsl(); +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif __attribute__((fallthrough)); case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: - __builtin_set_fpscr_rn(0b00); + __builtin_ppc_set_fpscr_rn(0b00); /* Insert an artificial "read/write" reference to the variable read below, to ensure the compiler does not schedule a read/use of the variable before the FPSCR is modified, above. @@ -193,7 +208,7 @@ extern __inline __m128 This can be removed if and when GCC PR102783 is fixed. 
*/ __asm__("" : : "wa"(__r)); - __builtin_set_fpscr_rn(__fpscr_save.__fpscr); + __builtin_ppc_set_fpscr_rn(__fpscr_save.__fpscr); break; case _MM_FROUND_TO_NEG_INF: case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: @@ -219,9 +234,14 @@ extern __inline __m128 */ __asm__("" : : "wa"(__r)); /* Restore enabled exceptions. */ - __fpscr_save.__fr = __builtin_mffsl(); +#ifdef _ARCH_PWR9 + __fpscr_save.__fr = __builtin_ppc_mffsl(); +#else + __fpscr_save.__fr = __builtin_ppc_mffs(); + __fpscr_save.__fpscr &= 0x70007f0ffL; +#endif __fpscr_save.__fpscr |= __enables_save.__fpscr; - __builtin_mtfsf(0b00000011, __fpscr_save.__fr); + __builtin_ppc_mtfsf(0b00000011, __fpscr_save.__fr); } return (__m128)__r; } diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc.c b/clang/test/CodeGen/PowerPC/builtins-ppc.c index ccc91b6560845..c13edf44cdcbd 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc.c @@ -1,5 +1,8 @@ // REQUIRES: powerpc-registered-target -// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \ +// RUN: | FileCheck %s +// RUN: %clang_cc1 -triple powerpc-unknown-unknown -emit-llvm %s -o - \ +// RUN: -target-cpu pwr9 | FileCheck %s --check-prefixes=P9,CHECK void test_eh_return_data_regno() { @@ -26,6 +29,9 @@ void test_builtin_ppc_setrnd() { // CHECK: call double @llvm.ppc.setrnd(i32 %2) res = __builtin_setrnd(x); + + // CHECK: call double @llvm.ppc.setrnd(i32 %4) + res = __builtin_ppc_set_fpscr_rn(x); } void test_builtin_ppc_flm() { @@ -33,7 +39,10 @@ void test_builtin_ppc_flm() { // CHECK: call double @llvm.ppc.readflm() res = __builtin_readflm(); - // CHECK: call double @llvm.ppc.setflm(double %1) + // CHECK: call double @llvm.ppc.readflm() + res = __builtin_ppc_mffs(); + + // CHECK: call double @llvm.ppc.setflm(double %2) res = __builtin_setflm(res); #ifdef _ARCH_PWR9 diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c index e2d26e611ac81..15d291496c20a 100644 --- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c @@ -8,6 +8,11 @@ // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10 +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only + // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ diff --git a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c index 4cb5b8540092f..1dc6292ae3244 100644 --- a/clang/test/CodeGen/PowerPC/ppc-mmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-mmintrin.c @@ -9,6 +9,11 @@ // 
RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n| FileCheck %s --check-prefixes=CHECK-P9,CHECK,CHECK-LE +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only + // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P8,CHECK,CHECK-BE // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -DNO_WARN_X86_INTRINSICS %s \ diff --git a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c index 39194427978ad..6e152c549498d 100644 --- a/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-pmmintrin.c @@ -13,6 +13,9 @@ // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -DNO_MM_MALLOC -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only + #include __m128d resd, md1, md2; diff --git a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c index 220b65c1ce164..7daef71a61c32 100644 --- a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c +++ b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c @@ -15,6 +15,11 @@ // RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10 +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only +// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ +// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only + // RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ // RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s // RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \ @@ -239,44 +244,48 @@ test_round() { // CHECK-LABEL: @test_round // CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) -// CHECK: call signext i32 @__builtin_mffs() -// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.readflm() +// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}}) // CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" -// CHECK: call 
signext i32 @__builtin_mffsl() -// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0) +// CHECK: call double @llvm.ppc.readflm() +// P10: call double @llvm.ppc.mffsl() +// CHECK: call double @llvm.ppc.setrnd(i32 0) // CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0" // CHECK: call <4 x float> @vec_rint(float vector[4]) // CHECK: call void asm sideeffect "", "^wa" -// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}}) // CHECK: call <4 x float> @vec_floor(float vector[4]) // CHECK: call <4 x float> @vec_ceil(float vector[4]) // CHECK: call <4 x float> @vec_trunc(float vector[4]) // CHECK: call <4 x float> @vec_rint(float vector[4]) // CHECK: call void asm sideeffect "", "^wa" -// CHECK: call signext i32 @__builtin_mffsl() -// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.readflm() +// P10: call double @llvm.ppc.mffsl() +// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}}) // CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) // CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) // CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0 // CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) -// CHECK: call signext i32 @__builtin_mffs() -// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.readflm() +// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}}) // CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" -// CHECK: call signext i32 @__builtin_mffsl() -// CHECK: call signext i32 @__builtin_set_fpscr_rn(i32 noundef signext 0) +// CHECK: call double @llvm.ppc.readflm() +// P10: call double @llvm.ppc.mffsl() +// CHECK: call double @llvm.ppc.setrnd(i32 0) // CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0" // CHECK: call <2 x double> @vec_rint(double vector[2]) // CHECK: call void asm sideeffect "", "^wa" -// CHECK: call signext i32 @__builtin_set_fpscr_rn(i64 noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.setrnd(i32 %{{[0-9a-zA-Z_.]+}}) // CHECK: call <2 x double> @vec_floor(double vector[2]) // CHECK: call <2 x double> @vec_ceil(double vector[2]) // CHECK: call <2 x double> @vec_trunc(double vector[2]) // CHECK: call <2 x double> @vec_rint(double vector[2]) // CHECK: call void asm sideeffect "", "^wa" -// CHECK: call signext i32 @__builtin_mffsl() -// CHECK: call signext i32 @__builtin_mtfsf(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}}) +// CHECK: call double @llvm.ppc.readflm() +// P10: call double @llvm.ppc.mffsl() +// CHECK: call void @llvm.ppc.mtfsf(i32 3, double %{{[0-9a-zA-Z_.]+}}) // CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) // CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}}) diff --git a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c 
index 60633e34b56b9..40d3839dcf026 100644
--- a/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-tmmintrin.c
@@ -13,6 +13,9 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE

+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-gnu-linux -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 #include <tmmintrin.h>

 __m64 res, m1, m2;
diff --git a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
index 238ce7c7ee574..ac90a5f8c530b 100644
--- a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
@@ -12,6 +12,9 @@
 // RUN: %clang -S -emit-llvm -target powerpc64-ibm-aix -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
 // RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s

+// RUN: %clang -x c++ -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN:   -fno-discard-value-names -mllvm -disable-llvm-optzns -fsyntax-only
+
 #include <x86gprintrin.h>

 unsigned short us;

From a1260b5209968c08886e3c6183aa793de8931578 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve
Date: Thu, 26 Oct 2023 09:57:14 +0200
Subject: [PATCH 020/877] [AMDGPU] Use `S_CSELECT` for uniform i1 ext (#69703)

Solves #59869

---
 llvm/lib/Target/AMDGPU/SIInstructions.td | 25 +++++----
 llvm/test/CodeGen/AMDGPU/saddo.ll        | 65 ++++++++++++++----------
 llvm/test/CodeGen/AMDGPU/uaddo.ll        | 46 +++++++++--------
 llvm/test/CodeGen/AMDGPU/usubo.ll        | 46 +++++++++--------
 llvm/test/CodeGen/AMDGPU/zero_extend.ll  |  2 +-
 5 files changed, 107 insertions(+), 77 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 567f1b812c180..707a1c72b5b7c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2278,17 +2278,24 @@ def : GCNPat <
   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
>;

-class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
-  (i64 (ext i1:$src)),
-  (REG_SEQUENCE VReg_64,
-      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
-                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
-      sub0, (S_MOV_B32 (i32 0)), sub1)
->;
+multiclass ZExt_i64_i1_Pat <SDNode ext> {
+  def: GCNPat <
+    (i64 (ext i1:$src)),
+    (REG_SEQUENCE VReg_64,
+        (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                           /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+        sub0, (S_MOV_B32 (i32 0)), sub1)
+  >;
+
+  def : GCNPat <
+    (i64 (UniformUnaryFrag<ext> SCC)),
+    (S_CSELECT_B64 (i64 1), (i64 0))
+  >;
+}

-def : ZExt_i64_i1_Pat<zext>;
-def : ZExt_i64_i1_Pat<anyext>;
+defm : ZExt_i64_i1_Pat<zext>;
+defm : ZExt_i64_i1_Pat<anyext>;

 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
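The effect on generated code is visible in the test updates that follow. Schematically, for a uniform i1 zero-extend feeding a 64-bit add, selection moves from a VALU sequence to a scalar one (this excerpt is taken from the SI check-line changes in saddo.ll below):
```
; before: materialize the i1 in a VGPR, then add with carry in VALU
v_cndmask_b32_e64 v0, 0, 1, s[4:5]
v_add_i32_e32     v0, vcc, s10, v0
v_addc_u32_e32    v1, vcc, 0, v1, vcc

; after: keep the value in SGPRs via SCC
s_cselect_b64     s[4:5], 1, 0
s_add_u32         s4, s10, s4
s_addc_u32        s5, s11, s5
```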
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index cb3166d7a20d3..f6f3e47c3be7a 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -29,10 +29,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec +; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 +; SI-NEXT: s_add_u32 s4, s10, s4 +; SI-NEXT: s_addc_u32 s5, s11, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -45,15 +47,17 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: s_add_u32 s2, s6, s0 ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_addc_u32 s3, s7, s1 -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec +; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -67,13 +71,15 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -87,11 +93,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-NEXT: s_add_u32 s0, s6, s2 ; GFX10-NEXT: s_addc_u32 s1, s7, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s6 +; GFX10-NEXT: s_and_b32 s2, s2, exec_lo +; GFX10-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GFX10-NEXT: s_add_u32 s0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -100,18 
+109,20 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 ; GFX11-NEXT: s_addc_u32 s3, s7, s1 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 +; GFX11-NEXT: s_xor_b32 s0, s0, s6 +; GFX11-NEXT: s_and_b32 s0, s0, exec_lo +; GFX11-NEXT: s_cselect_b64 s[0:1], 1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 4363db2351e7a..0ebf3f5198203 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -7,21 +7,23 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_u32 s0, s6, s0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_addc_u32 s1, s7, s1 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_cselect_b64 s[6:7], 1, 0 +; SI-NEXT: s_add_u32 s6, s0, s6 +; SI-NEXT: s_addc_u32 s7, s1, s7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_add_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_addc_u32 s5, s7, s9 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -30,17 +32,19 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_addc_u32 s1, s7, s1 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 +; 
VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -52,13 +56,15 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 37b5be3b672f2..ade0616137b17 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -8,21 +8,23 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_u32 s0, s6, s0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_subb_u32 s1, s7, s1 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_cselect_b64 s[6:7], 1, 0 +; SI-NEXT: s_add_u32 s6, s0, s6 +; SI-NEXT: s_addc_u32 s7, s1, s7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_sub_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_subb_u32 s5, s7, s9 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -31,17 +33,19 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_subb_u32 s1, s7, s1 -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_and_b64 s[2:3], vcc, exec +; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -53,13 +57,15 @@ 
define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_sub_u32 s0, s6, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_subb_u32 s1, s7, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec +; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index 1f532f2706de7..9933cdc18e5fd 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -38,7 +38,7 @@ define amdgpu_kernel void @s_arg_zext_i1_to_i64(ptr addrspace(1) %out, i1 zeroex ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0 ; GCN-DAG: s_cmp_eq_u32 -; GCN: v_cndmask_b32 +; GCN: s_cselect_b64 s[{{[0-9]+:[0-9]+}}], 1, 0 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 From e58c4c771329eda8ef407b080e2f91e30aabac73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 26 Oct 2023 10:58:28 +0300 Subject: [PATCH 021/877] [LLD] [COFF] Recognize Itanium vtables for ICF (#70196) The testcases are plain copies of the existing ICF vtable testcase, with symbol names renamed to match the Itanium vtable name pattern. --- lld/COFF/ICF.cpp | 5 ++++- lld/test/COFF/icf-vtables-itanium-i386.s | 27 +++++++++++++++++++++++ lld/test/COFF/icf-vtables-itanium.s | 28 ++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 lld/test/COFF/icf-vtables-itanium-i386.s create mode 100644 lld/test/COFF/icf-vtables-itanium.s diff --git a/lld/COFF/ICF.cpp b/lld/COFF/ICF.cpp index 0f43da0dbc101..013ffcfb3d5d1 100644 --- a/lld/COFF/ICF.cpp +++ b/lld/COFF/ICF.cpp @@ -94,7 +94,10 @@ bool ICF::isEligible(SectionChunk *c) { return true; // So are vtables. - if (c->sym && c->sym->getName().starts_with("??_7")) + const char *itaniumVtablePrefix = + ctx.config.machine == I386 ? "__ZTV" : "_ZTV"; + if (c->sym && (c->sym->getName().starts_with("??_7") || + c->sym->getName().starts_with(itaniumVtablePrefix))) return true; // Anything else not in an address-significance table is eligible. 
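For context (editorial note, not part of the patch): under Itanium C++ mangling the vtable of a class S is named _ZTV1S, and on i386 MinGW targets symbols carry an extra leading underscore, giving __ZTV1S; that is the pair of prefixes checked above. A hypothetical source producing such a symbol:

// Building this for a MinGW target emits the vtable for S in a discardable
// COMDAT section named after _ZTV1S (or __ZTV1S on i386). Sections whose
// symbol starts with the Itanium prefix are now ICF-eligible, matching the
// existing treatment of MSVC-mangled ??_7 vtables.
struct S {
  virtual int f() { return 42; }
};

int main() {
  S s;
  return s.f();
}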
diff --git a/lld/test/COFF/icf-vtables-itanium-i386.s b/lld/test/COFF/icf-vtables-itanium-i386.s new file mode 100644 index 0000000000000..77b3755178b8b --- /dev/null +++ b/lld/test/COFF/icf-vtables-itanium-i386.s @@ -0,0 +1,27 @@ +# REQUIRES: x86 +# RUN: llvm-mc -triple=i386-windows-gnu -filetype=obj -o %t.obj %s +# RUN: lld-link %t.obj /out:%t.exe /entry:main /subsystem:console /safeseh:no +# RUN: llvm-objdump -s %t.exe | FileCheck %s + +# CHECK: Contents of section .text: +.globl _main +_main: +# CHECK-NEXT: 401000 00204000 01204000 01204000 +.long __ZTS +.long __ZTV +.long __ZTVa + +.section .rdata,"dr",discard,__ZTS +.globl __ZTS +__ZTS: +.byte 42 + +.section .rdata,"dr",discard,__ZTV +.globl __ZTV +__ZTV: +.byte 42 + +.section .rdata,"dr",discard,__ZTVa +.globl __ZTVa +__ZTVa: +.byte 42 diff --git a/lld/test/COFF/icf-vtables-itanium.s b/lld/test/COFF/icf-vtables-itanium.s new file mode 100644 index 0000000000000..94f2b611dc86b --- /dev/null +++ b/lld/test/COFF/icf-vtables-itanium.s @@ -0,0 +1,28 @@ +# REQUIRES: x86 +# RUN: llvm-mc -triple=x86_64-windows-gnu -filetype=obj -o %t.obj %s +# RUN: lld-link %t.obj /out:%t.exe /entry:main /subsystem:console +# RUN: llvm-objdump -s %t.exe | FileCheck %s + +# CHECK: Contents of section .text: +.globl main +main: +# CHECK-NEXT: 140001000 00200040 01000000 01200040 01000000 +.8byte _ZTS +.8byte _ZTV +# CHECK-NEXT: 140001010 01200040 01000000 +.8byte _ZTVa + +.section .rdata,"dr",discard,_ZTS +.globl _ZTS +_ZTS: +.byte 42 + +.section .rdata,"dr",discard,_ZTV +.globl _ZTV +_ZTV: +.byte 42 + +.section .rdata,"dr",discard,_ZTVa +.globl _ZTVa +_ZTVa: +.byte 42 From 34fe8be705c608c4c25ece725eef9305ccfa64e1 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Wed, 25 Oct 2023 07:09:54 +0200 Subject: [PATCH 022/877] [test][AggressiveInstCombine] Precommit testcase for #69925 We get different results with/without debug info present. --- .../AArch64/combine_ignore_debug.ll | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll new file mode 100644 index 0000000000000..4b41060544f7a --- /dev/null +++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -mtriple aarch64 -aggressive-instcombine-max-scan-instrs=1 -passes="aggressive-instcombine" -S < %s | FileCheck %s -check-prefix DBG +; RUN: opt -strip-debug -mtriple aarch64 -aggressive-instcombine-max-scan-instrs=1 -passes="aggressive-instcombine" -S < %s | FileCheck %s -check-prefix NODBG + +; FIXME: The DBG and NODBG cases should be the same. I.e. we should optimize the +; DBG case too even if there is a dbg.value. 
+; This is described in https://github.com/llvm/llvm-project/issues/69925 + +target datalayout = "E" + +%s = type { i16, i16 } + +@e = global %s zeroinitializer, align 1 +@l = global %s zeroinitializer, align 1 + +define void @test() { +; DBG-LABEL: define void @test() { +; DBG-NEXT: entry: +; DBG-NEXT: [[L1:%.*]] = load i16, ptr @e, align 1 +; DBG-NEXT: call void @llvm.dbg.value(metadata i32 undef, metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]] +; DBG-NEXT: [[L2:%.*]] = load i16, ptr getelementptr inbounds ([[S:%.*]], ptr @e, i16 0, i32 1), align 1 +; DBG-NEXT: [[E2:%.*]] = zext i16 [[L2]] to i32 +; DBG-NEXT: [[E1:%.*]] = zext i16 [[L1]] to i32 +; DBG-NEXT: [[S1:%.*]] = shl nuw i32 [[E1]], 16 +; DBG-NEXT: [[O1:%.*]] = or i32 [[S1]], [[E2]] +; DBG-NEXT: store i32 [[O1]], ptr @l, align 1 +; DBG-NEXT: ret void +; +; NODBG-LABEL: define void @test() { +; NODBG-NEXT: entry: +; NODBG-NEXT: [[L1:%.*]] = load i32, ptr @e, align 1 +; NODBG-NEXT: store i32 [[L1]], ptr @l, align 1 +; NODBG-NEXT: ret void +; +entry: + %l1 = load i16, ptr @e, align 1 + call void @llvm.dbg.value(metadata i32 undef, metadata !3, metadata !DIExpression()), !dbg !5 + %l2 = load i16, ptr getelementptr inbounds (%s, ptr @e, i16 0, i32 1), align 1 + %e2 = zext i16 %l2 to i32 + %e1 = zext i16 %l1 to i32 + %s1 = shl nuw i32 %e1, 16 + %o1 = or i32 %s1, %e2 + store i32 %o1, ptr @l, align 1 + ret void +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1) +!1 = !DIFile(filename: "foo.c", directory: "/") +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DILocalVariable(scope: !4) +!4 = distinct !DISubprogram(unit: !0) +!5 = !DILocation(scope: !4) From ce0a750fe4b04630fcb242822627d790f4c2878b Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Wed, 25 Oct 2023 07:11:15 +0200 Subject: [PATCH 023/877] [AggressiveInstCombine] Ignore debug instructions when load combining (#70200) We previously included debug instructions when counting instructions when looking for loads to combine. This meant that the presence of debug instructions could affect optimization, as shown in the updated testcase. This fixes #69925. --- .../AggressiveInstCombine.cpp | 5 ++++- .../AArch64/combine_ignore_debug.ll | 14 ++++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp index a55d01645f10e..d09ac1c099c1a 100644 --- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp +++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp @@ -706,7 +706,10 @@ static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, make_range(Start->getIterator(), End->getIterator())) { if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc))) return false; - if (++NumScanned > MaxInstrsToScan) + + // Ignore debug info so that's not counted against MaxInstrsToScan. + // Otherwise debug info could affect codegen. 
+    if (!isa<DbgInfoIntrinsic>(Inst) && ++NumScanned > MaxInstrsToScan)
       return false;
   }
 
diff --git a/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll b/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll
index 4b41060544f7a..68455a1f9074e 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/AArch64/combine_ignore_debug.ll
@@ -2,9 +2,8 @@
 ; RUN: opt -mtriple aarch64 -aggressive-instcombine-max-scan-instrs=1 -passes="aggressive-instcombine" -S < %s | FileCheck %s -check-prefix DBG
 ; RUN: opt -strip-debug -mtriple aarch64 -aggressive-instcombine-max-scan-instrs=1 -passes="aggressive-instcombine" -S < %s | FileCheck %s -check-prefix NODBG
 
-; FIXME: The DBG and NODBG cases should be the same. I.e. we should optimize the
-; DBG case too even if there is a dbg.value.
-; This is described in https://github.com/llvm/llvm-project/issues/69925
+; The DBG and NODBG cases should be the same. I.e. we should optimize the DBG
+; case too even if there is a dbg.value.
 
 target datalayout = "E"
 
@@ -16,14 +15,9 @@ target datalayout = "E"
 define void @test() {
 ; DBG-LABEL: define void @test() {
 ; DBG-NEXT:  entry:
-; DBG-NEXT:    [[L1:%.*]] = load i16, ptr @e, align 1
+; DBG-NEXT:    [[L1:%.*]] = load i32, ptr @e, align 1
 ; DBG-NEXT:    call void @llvm.dbg.value(metadata i32 undef, metadata [[META3:![0-9]+]], metadata !DIExpression()), !dbg [[DBG5:![0-9]+]]
-; DBG-NEXT:    [[L2:%.*]] = load i16, ptr getelementptr inbounds ([[S:%.*]], ptr @e, i16 0, i32 1), align 1
-; DBG-NEXT:    [[E2:%.*]] = zext i16 [[L2]] to i32
-; DBG-NEXT:    [[E1:%.*]] = zext i16 [[L1]] to i32
-; DBG-NEXT:    [[S1:%.*]] = shl nuw i32 [[E1]], 16
-; DBG-NEXT:    [[O1:%.*]] = or i32 [[S1]], [[E2]]
-; DBG-NEXT:    store i32 [[O1]], ptr @l, align 1
+; DBG-NEXT:    store i32 [[L1]], ptr @l, align 1
 ; DBG-NEXT:    ret void
 ;
 ; NODBG-LABEL: define void @test() {

From 5ef45c02dc0cd6cce0b3eb099707b8fc696a8ff6 Mon Sep 17 00:00:00 2001
From: Guray Ozen
Date: Thu, 26 Oct 2023 10:02:32 +0200
Subject: [PATCH 024/877] [mlir][cuda] Avoid driver call to check max shared memory (#70021)

This PR guards the driver call with an if statement, since driver calls
are comparatively expensive. As a future improvement, the if statement
could be generated by the compiler and thus optimized away in some
cases.
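(Editorial sketch, not the wrapper code itself: the same guard pattern in a self-contained form. The helper name is hypothetical and error checking is elided; the driver entry points and the attribute are the ones used in the diff below.)

#include <cuda.h>
#include <cstdio>

// Only pay for the driver queries (device lookup plus attribute read) when
// dynamic shared memory is actually requested; the common smem == 0 launch
// path performs no extra driver calls.
void launchChecked(CUfunction f, unsigned smem, CUstream stream, void **params) {
  if (smem > 0) {
    int maxShmem = 0;
    CUdevice device;
    cuDeviceGet(&device, /*ordinal=*/0);
    cuDeviceGetAttribute(
        &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN,
        device);
    if (maxShmem < static_cast<int>(smem))
      fprintf(stderr, "requested smem (%u) exceeds device maximum (%d)\n",
              smem, maxShmem);
    cuFuncSetAttribute(f, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                       static_cast<int>(smem));
  }
  cuLaunchKernel(f, 1, 1, 1, 1, 1, 1, smem, stream, params, /*extra=*/nullptr);
}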
--- .../ExecutionEngine/CudaRuntimeWrappers.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp index 55db744af021c..a8e743c519135 100644 --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -168,20 +168,23 @@ mgpuLaunchKernel(CUfunction function, intptr_t gridX, intptr_t gridY, intptr_t blockZ, int32_t smem, CUstream stream, void **params, void **extra, size_t /*paramsCount*/) { ScopedContext scopedContext; - int32_t maxShmem = 0; - CUdevice device = getDefaultCuDevice(); - CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); - CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( - &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, - device)); - if (maxShmem < smem) { - fprintf(stderr, - "Requested shared memory (%dkb) is larger than maximum allowed " - "shared memory (%dkb) for this device\n", - smem, maxShmem); + if (smem > 0) { + // Avoid checking driver as it's more expensive than if statement + int32_t maxShmem = 0; + CUdevice device = getDefaultCuDevice(); + CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice)); + CUDA_REPORT_IF_ERROR(cuDeviceGetAttribute( + &maxShmem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, + device)); + if (maxShmem < smem) { + fprintf(stderr, + "Requested shared memory (%dkb) is larger than maximum allowed " + "shared memory (%dkb) for this device\n", + smem, maxShmem); + } + CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( + function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); } - CUDA_REPORT_IF_ERROR(cuFuncSetAttribute( - function, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, smem)); debug_print("Launching kernel, grid=%ld,%ld,%ld, " "threads: %ld, %ld, %ld, " "smem: %dkb\n", From f7dc26cab25e1000b1198625e30ce1acdbe8fcb4 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Thu, 26 Oct 2023 10:02:54 +0200 Subject: [PATCH 025/877] [mlir] Fixed typo in type (128x64 -> 64x128) in TMA load test (#70022) The test was meant to check `64x128xf16` as the contiguous dimension exceeds the cache line (128b). TMA requires cache line-aligned loads, so loading 64x128 can be done with two 64x64 loads, as documented in the test. However, there was a typo in the type, which was `memref<128x64xf16>` instead of the correct `memref<64x128xf16>`. This PR corrects the issue and updates the verification. 
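(Editorial check of the arithmetic behind the fix: a row of a 64x128 f16 tile occupies 128 elements x 2 B = 256 B, i.e. two 128 B swizzle/cache-line units, so the tile must be fetched as two cache-line-aligned 64x64 loads, each with rows of exactly 64 x 2 B = 128 B.)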
--- .../CUDA/sm90/tma_load_64x64_swizzle128b.mlir | 143 +++++++++--------- 1 file changed, 69 insertions(+), 74 deletions(-) diff --git a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir index 242c5ff875cf4..13b9c48dabe85 100644 --- a/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir +++ b/mlir/test/Integration/GPU/CUDA/sm90/tma_load_64x64_swizzle128b.mlir @@ -33,8 +33,8 @@ !shmemlhs = memref<128x64xf16, 3> !lhsTensorMap = !nvgpu.tensormap.descriptor -!rhs = memref<128x64xf16> -!shmemrhs = memref<128x64xf16, 3> +!rhs = memref<64x128xf16> +!shmemrhs = memref<64x128xf16, 3> !rhsTensorMap = !nvgpu.tensormap.descriptor module @mymod { @@ -68,10 +68,6 @@ module @mymod { memref.store %vR, %rhs[%i, %j] : !rhs %vR32 = arith.extf %vR : f16 to f32 memref.store %vR32, %rhs32[%i, %j] : memref<64x128xf32> - } - } - scf.for %j = %c0 to %c128 step %c1 { - scf.for %i = %c0 to %c64 step %c1 { %b0 = arith.muli %j, %c64 : index %b00 = arith.addi %b0, %i : index %b01 = arith.divui %b00, %c8 : index @@ -103,20 +99,19 @@ module @mymod { %6 = gpu.thread_id x %lhsShmem = memref.get_global @bufferLhsGlobal : !shmemlhs %rhsShmem = memref.get_global @bufferRhsGlobal : !shmemrhs - %rhsShmem2 = memref.subview %rhsShmem[%c32, %c0][%c32, %c128][%c1, %c1] : !shmemrhs to memref, 3> + %rhsShmem2 = memref.subview %rhsShmem[32, 0][128, 64][1, 1] : !shmemrhs to memref<128x64xf16, strided<[128, 1], offset: 4096>, 3> // Step 5. Initialize the mbarrier %9 = nvgpu.mbarrier.create -> !barrierType nvgpu.mbarrier.init %9[%c0], %5 : !barrierType %10 = arith.cmpi eq, %6, %c0 : index - // Step 6. First thread does TMA load scf.if %10 { gpu.printf "[GPU] TMA SIZE %d\0A" %c32768 : index nvgpu.tma.async.load %d_lhsTensorMap[%c0, %c0], %9[%c0] to %lhsShmem : !lhsTensorMap, !barrierType -> !shmemlhs nvgpu.tma.async.load %d_rhsTensorMap[%c0, %c0], %9[%c0] to %rhsShmem : !rhsTensorMap, !barrierType -> !shmemrhs - nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref, 3> + nvgpu.tma.async.load %d_rhsTensorMap[%c64, %c0], %9[%c0] to %rhsShmem2 : !rhsTensorMap, !barrierType -> memref<128x64xf16, strided<[128, 1], offset: 4096>, 3> nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c32768 : !barrierType } else { nvgpu.mbarrier.arrive.expect_tx %9[%c0], %c0 : !barrierType @@ -147,69 +142,69 @@ module @mymod { // CHECK: [GPU] TMA SIZE 32768 -// CHECK: ===--- Matrix B ---=== -1 -// CHECK: 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, -1 -// CHECK: 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22, 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, -1 -// CHECK: 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 
35, 35, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 51, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, -1 -// CHECK: 51, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52, 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, -1 -// CHECK: 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 85, 85, 85, 85, 85, 85, 85, 85, 84, 84, 84, 84, 84, 84, 84, 84, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 81, 81, 81, 81, 81, 81, 81, 81, 80, 80, 80, 80, 80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 83, 82, 82, 82, 82, 82, 82, 82, 82, -1 -// CHECK: 85, 85, 85, 85, 85, 85, 85, 85, 84, 84, 84, 84, 84, 84, 84, 84, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 81, 81, 81, 81, 81, 81, 81, 81, 80, 80, 80, 80, 80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 83, 82, 82, 82, 82, 82, 82, 82, 82, 102, 102, 102, 102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103, 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, 101, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 99, 96, 96, 96, 96, 96, 96, 96, 96, 97, 97, 97, 97, 97, 97, 97, 97, -1 -// CHECK: 102, 102, 102, 102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103, 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, 101, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 99, 96, 96, 96, 96, 96, 96, 96, 96, 97, 97, 97, 97, 97, 97, 97, 97, 119, 119, 119, 119, 119, 119, 119, 119, 118, 118, 118, 118, 118, 118, 118, 118, 117, 117, 117, 117, 117, 117, 117, 117, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 113, 113, 113, 113, 112, 112, 112, 112, 112, 112, 112, 112, -1 -// CHECK: 119, 119, 119, 119, 119, 119, 119, 119, 118, 118, 118, 118, 118, 118, 118, 118, 117, 117, 117, 117, 117, 117, 117, 117, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 113, 113, 113, 113, 112, 112, 112, 112, 112, 112, 112, 112, 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, -1 -// CHECK: 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 
132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, 145, 145, 145, 145, 145, 145, 145, 145, 144, 144, 144, 144, 144, 144, 144, 144, 147, 147, 147, 147, 147, 147, 147, 147, 146, 146, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 149, 149, 148, 148, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 151, 151, 151, 150, 150, 150, 150, 150, 150, 150, 150, -1 -// CHECK: 145, 145, 145, 145, 145, 145, 145, 145, 144, 144, 144, 144, 144, 144, 144, 144, 147, 147, 147, 147, 147, 147, 147, 147, 146, 146, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 149, 149, 149, 149, 148, 148, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 151, 151, 151, 150, 150, 150, 150, 150, 150, 150, 150, 162, 162, 162, 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 160, 160, 160, 160, 160, 160, 160, 160, 161, 161, 161, 161, 161, 161, 161, 161, 166, 166, 166, 166, 166, 166, 166, 166, 167, 167, 167, 167, 167, 167, 167, 167, 164, 164, 164, 164, 164, 164, 164, 164, 165, 165, 165, 165, 165, 165, 165, 165, -1 -// CHECK: 162, 162, 162, 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 160, 160, 160, 160, 160, 160, 160, 160, 161, 161, 161, 161, 161, 161, 161, 161, 166, 166, 166, 166, 166, 166, 166, 166, 167, 167, 167, 167, 167, 167, 167, 167, 164, 164, 164, 164, 164, 164, 164, 164, 165, 165, 165, 165, 165, 165, 165, 165, 179, 179, 179, 179, 179, 179, 179, 179, 178, 178, 178, 178, 178, 178, 178, 178, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 183, 183, 183, 183, 183, 183, 183, 183, 182, 182, 182, 182, 182, 182, 182, 182, 181, 181, 181, 181, 181, 181, 181, 181, 180, 180, 180, 180, 180, 180, 180, 180, -1 -// CHECK: 179, 179, 179, 179, 179, 179, 179, 179, 178, 178, 178, 178, 178, 178, 178, 178, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 183, 183, 183, 183, 183, 183, 183, 183, 182, 182, 182, 182, 182, 182, 182, 182, 181, 181, 181, 181, 181, 181, 181, 181, 180, 180, 180, 180, 180, 180, 180, 180, 196, 196, 196, 196, 196, 196, 196, 196, 197, 197, 197, 197, 197, 197, 197, 197, 198, 198, 198, 198, 198, 198, 198, 198, 199, 199, 199, 199, 199, 199, 199, 199, 192, 192, 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 195, 195, 195, -1 -// CHECK: 196, 196, 196, 196, 196, 196, 196, 196, 197, 197, 197, 197, 197, 197, 197, 197, 198, 198, 198, 198, 198, 198, 198, 198, 199, 199, 199, 199, 199, 199, 199, 199, 192, 192, 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 195, 195, 195, 213, 213, 213, 213, 213, 213, 213, 213, 212, 212, 212, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 215, 215, 214, 214, 214, 214, 214, 214, 214, 214, 209, 209, 209, 209, 209, 209, 209, 209, 208, 208, 208, 208, 208, 208, 208, 208, 211, 211, 211, 211, 211, 211, 211, 211, 210, 210, 210, 210, 210, 210, 210, 210, -1 -// CHECK: 213, 213, 213, 213, 213, 213, 213, 213, 212, 212, 212, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 215, 215, 214, 214, 214, 214, 214, 214, 214, 214, 209, 209, 209, 209, 209, 209, 209, 209, 208, 208, 208, 208, 208, 208, 208, 208, 211, 211, 211, 211, 211, 211, 211, 211, 210, 210, 210, 210, 210, 210, 210, 210, 230, 230, 230, 230, 230, 230, 230, 230, 231, 231, 231, 231, 231, 231, 231, 231, 228, 228, 228, 228, 228, 228, 228, 228, 229, 229, 229, 229, 229, 
229, 229, 229, 226, 226, 226, 226, 226, 226, 226, 226, 227, 227, 227, 227, 227, 227, 227, 227, 224, 224, 224, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, 225, 225, 225, -1 -// CHECK: 230, 230, 230, 230, 230, 230, 230, 230, 231, 231, 231, 231, 231, 231, 231, 231, 228, 228, 228, 228, 228, 228, 228, 228, 229, 229, 229, 229, 229, 229, 229, 229, 226, 226, 226, 226, 226, 226, 226, 226, 227, 227, 227, 227, 227, 227, 227, 227, 224, 224, 224, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, 225, 225, 225, 247, 247, 247, 247, 247, 247, 247, 247, 246, 246, 246, 246, 246, 246, 246, 246, 245, 245, 245, 245, 245, 245, 245, 245, 244, 244, 244, 244, 244, 244, 244, 244, 243, 243, 243, 243, 243, 243, 243, 243, 242, 242, 242, 242, 242, 242, 242, 242, 241, 241, 241, 241, 241, 241, 241, 241, 240, 240, 240, 240, 240, 240, 240, 240, -1 -// CHECK: 247, 247, 247, 247, 247, 247, 247, 247, 246, 246, 246, 246, 246, 246, 246, 246, 245, 245, 245, 245, 245, 245, 245, 245, 244, 244, 244, 244, 244, 244, 244, 244, 243, 243, 243, 243, 243, 243, 243, 243, 242, 242, 242, 242, 242, 242, 242, 242, 241, 241, 241, 241, 241, 241, 241, 241, 240, 240, 240, 240, 240, 240, 240, 240, 256, 256, 256, 256, 256, 256, 256, 256, 257, 257, 257, 257, 257, 257, 257, 257, 258, 258, 258, 258, 258, 258, 258, 258, 259, 259, 259, 259, 259, 259, 259, 259, 260, 260, 260, 260, 260, 260, 260, 260, 261, 261, 261, 261, 261, 261, 261, 261, 262, 262, 262, 262, 262, 262, 262, 262, 263, 263, 263, 263, 263, 263, 263, 263, -1 -// CHECK: 256, 256, 256, 256, 256, 256, 256, 256, 257, 257, 257, 257, 257, 257, 257, 257, 258, 258, 258, 258, 258, 258, 258, 258, 259, 259, 259, 259, 259, 259, 259, 259, 260, 260, 260, 260, 260, 260, 260, 260, 261, 261, 261, 261, 261, 261, 261, 261, 262, 262, 262, 262, 262, 262, 262, 262, 263, 263, 263, 263, 263, 263, 263, 263, 273, 273, 273, 273, 273, 273, 273, 273, 272, 272, 272, 272, 272, 272, 272, 272, 275, 275, 275, 275, 275, 275, 275, 275, 274, 274, 274, 274, 274, 274, 274, 274, 277, 277, 277, 277, 277, 277, 277, 277, 276, 276, 276, 276, 276, 276, 276, 276, 279, 279, 279, 279, 279, 279, 279, 279, 278, 278, 278, 278, 278, 278, 278, 278, -1 -// CHECK: 273, 273, 273, 273, 273, 273, 273, 273, 272, 272, 272, 272, 272, 272, 272, 272, 275, 275, 275, 275, 275, 275, 275, 275, 274, 274, 274, 274, 274, 274, 274, 274, 277, 277, 277, 277, 277, 277, 277, 277, 276, 276, 276, 276, 276, 276, 276, 276, 279, 279, 279, 279, 279, 279, 279, 279, 278, 278, 278, 278, 278, 278, 278, 278, 290, 290, 290, 290, 290, 290, 290, 290, 291, 291, 291, 291, 291, 291, 291, 291, 288, 288, 288, 288, 288, 288, 288, 288, 289, 289, 289, 289, 289, 289, 289, 289, 294, 294, 294, 294, 294, 294, 294, 294, 295, 295, 295, 295, 295, 295, 295, 295, 292, 292, 292, 292, 292, 292, 292, 292, 293, 293, 293, 293, 293, 293, 293, 293, -1 -// CHECK: 290, 290, 290, 290, 290, 290, 290, 290, 291, 291, 291, 291, 291, 291, 291, 291, 288, 288, 288, 288, 288, 288, 288, 288, 289, 289, 289, 289, 289, 289, 289, 289, 294, 294, 294, 294, 294, 294, 294, 294, 295, 295, 295, 295, 295, 295, 295, 295, 292, 292, 292, 292, 292, 292, 292, 292, 293, 293, 293, 293, 293, 293, 293, 293, 307, 307, 307, 307, 307, 307, 307, 307, 306, 306, 306, 306, 306, 306, 306, 306, 305, 305, 305, 305, 305, 305, 305, 305, 304, 304, 304, 304, 304, 304, 304, 304, 311, 311, 311, 311, 311, 311, 311, 311, 310, 310, 310, 310, 310, 310, 310, 310, 309, 309, 309, 309, 309, 309, 309, 309, 308, 308, 308, 308, 308, 308, 308, 308, -1 -// CHECK: 307, 307, 307, 307, 307, 307, 307, 307, 306, 306, 306, 306, 306, 306, 306, 306, 305, 305, 305, 
305, 305, 305, 305, 305, 304, 304, 304, 304, 304, 304, 304, 304, 311, 311, 311, 311, 311, 311, 311, 311, 310, 310, 310, 310, 310, 310, 310, 310, 309, 309, 309, 309, 309, 309, 309, 309, 308, 308, 308, 308, 308, 308, 308, 308, 324, 324, 324, 324, 324, 324, 324, 324, 325, 325, 325, 325, 325, 325, 325, 325, 326, 326, 326, 326, 326, 326, 326, 326, 327, 327, 327, 327, 327, 327, 327, 327, 320, 320, 320, 320, 320, 320, 320, 320, 321, 321, 321, 321, 321, 321, 321, 321, 322, 322, 322, 322, 322, 322, 322, 322, 323, 323, 323, 323, 323, 323, 323, 323, -1 -// CHECK: 324, 324, 324, 324, 324, 324, 324, 324, 325, 325, 325, 325, 325, 325, 325, 325, 326, 326, 326, 326, 326, 326, 326, 326, 327, 327, 327, 327, 327, 327, 327, 327, 320, 320, 320, 320, 320, 320, 320, 320, 321, 321, 321, 321, 321, 321, 321, 321, 322, 322, 322, 322, 322, 322, 322, 322, 323, 323, 323, 323, 323, 323, 323, 323, 341, 341, 341, 341, 341, 341, 341, 341, 340, 340, 340, 340, 340, 340, 340, 340, 343, 343, 343, 343, 343, 343, 343, 343, 342, 342, 342, 342, 342, 342, 342, 342, 337, 337, 337, 337, 337, 337, 337, 337, 336, 336, 336, 336, 336, 336, 336, 336, 339, 339, 339, 339, 339, 339, 339, 339, 338, 338, 338, 338, 338, 338, 338, 338, -1 -// CHECK: 341, 341, 341, 341, 341, 341, 341, 341, 340, 340, 340, 340, 340, 340, 340, 340, 343, 343, 343, 343, 343, 343, 343, 343, 342, 342, 342, 342, 342, 342, 342, 342, 337, 337, 337, 337, 337, 337, 337, 337, 336, 336, 336, 336, 336, 336, 336, 336, 339, 339, 339, 339, 339, 339, 339, 339, 338, 338, 338, 338, 338, 338, 338, 338, 358, 358, 358, 358, 358, 358, 358, 358, 359, 359, 359, 359, 359, 359, 359, 359, 356, 356, 356, 356, 356, 356, 356, 356, 357, 357, 357, 357, 357, 357, 357, 357, 354, 354, 354, 354, 354, 354, 354, 354, 355, 355, 355, 355, 355, 355, 355, 355, 352, 352, 352, 352, 352, 352, 352, 352, 353, 353, 353, 353, 353, 353, 353, 353, -1 -// CHECK: 358, 358, 358, 358, 358, 358, 358, 358, 359, 359, 359, 359, 359, 359, 359, 359, 356, 356, 356, 356, 356, 356, 356, 356, 357, 357, 357, 357, 357, 357, 357, 357, 354, 354, 354, 354, 354, 354, 354, 354, 355, 355, 355, 355, 355, 355, 355, 355, 352, 352, 352, 352, 352, 352, 352, 352, 353, 353, 353, 353, 353, 353, 353, 353, 375, 375, 375, 375, 375, 375, 375, 375, 374, 374, 374, 374, 374, 374, 374, 374, 373, 373, 373, 373, 373, 373, 373, 373, 372, 372, 372, 372, 372, 372, 372, 372, 371, 371, 371, 371, 371, 371, 371, 371, 370, 370, 370, 370, 370, 370, 370, 370, 369, 369, 369, 369, 369, 369, 369, 369, 368, 368, 368, 368, 368, 368, 368, 368, -1 -// CHECK: 375, 375, 375, 375, 375, 375, 375, 375, 374, 374, 374, 374, 374, 374, 374, 374, 373, 373, 373, 373, 373, 373, 373, 373, 372, 372, 372, 372, 372, 372, 372, 372, 371, 371, 371, 371, 371, 371, 371, 371, 370, 370, 370, 370, 370, 370, 370, 370, 369, 369, 369, 369, 369, 369, 369, 369, 368, 368, 368, 368, 368, 368, 368, 368, 384, 384, 384, 384, 384, 384, 384, 384, 385, 385, 385, 385, 385, 385, 385, 385, 386, 386, 386, 386, 386, 386, 386, 386, 387, 387, 387, 387, 387, 387, 387, 387, 388, 388, 388, 388, 388, 388, 388, 388, 389, 389, 389, 389, 389, 389, 389, 389, 390, 390, 390, 390, 390, 390, 390, 390, 391, 391, 391, 391, 391, 391, 391, 391, -1 -// CHECK: 384, 384, 384, 384, 384, 384, 384, 384, 385, 385, 385, 385, 385, 385, 385, 385, 386, 386, 386, 386, 386, 386, 386, 386, 387, 387, 387, 387, 387, 387, 387, 387, 388, 388, 388, 388, 388, 388, 388, 388, 389, 389, 389, 389, 389, 389, 389, 389, 390, 390, 390, 390, 390, 390, 390, 390, 391, 391, 391, 391, 391, 391, 391, 391, 401, 401, 401, 401, 401, 401, 401, 401, 400, 400, 400, 400, 
400, 400, 400, 400, 403, 403, 403, 403, 403, 403, 403, 403, 402, 402, 402, 402, 402, 402, 402, 402, 405, 405, 405, 405, 405, 405, 405, 405, 404, 404, 404, 404, 404, 404, 404, 404, 407, 407, 407, 407, 407, 407, 407, 407, 406, 406, 406, 406, 406, 406, 406, 406, -1 -// CHECK: 401, 401, 401, 401, 401, 401, 401, 401, 400, 400, 400, 400, 400, 400, 400, 400, 403, 403, 403, 403, 403, 403, 403, 403, 402, 402, 402, 402, 402, 402, 402, 402, 405, 405, 405, 405, 405, 405, 405, 405, 404, 404, 404, 404, 404, 404, 404, 404, 407, 407, 407, 407, 407, 407, 407, 407, 406, 406, 406, 406, 406, 406, 406, 406, 418, 418, 418, 418, 418, 418, 418, 418, 419, 419, 419, 419, 419, 419, 419, 419, 416, 416, 416, 416, 416, 416, 416, 416, 417, 417, 417, 417, 417, 417, 417, 417, 422, 422, 422, 422, 422, 422, 422, 422, 423, 423, 423, 423, 423, 423, 423, 423, 420, 420, 420, 420, 420, 420, 420, 420, 421, 421, 421, 421, 421, 421, 421, 421, -1 -// CHECK: 418, 418, 418, 418, 418, 418, 418, 418, 419, 419, 419, 419, 419, 419, 419, 419, 416, 416, 416, 416, 416, 416, 416, 416, 417, 417, 417, 417, 417, 417, 417, 417, 422, 422, 422, 422, 422, 422, 422, 422, 423, 423, 423, 423, 423, 423, 423, 423, 420, 420, 420, 420, 420, 420, 420, 420, 421, 421, 421, 421, 421, 421, 421, 421, 435, 435, 435, 435, 435, 435, 435, 435, 434, 434, 434, 434, 434, 434, 434, 434, 433, 433, 433, 433, 433, 433, 433, 433, 432, 432, 432, 432, 432, 432, 432, 432, 439, 439, 439, 439, 439, 439, 439, 439, 438, 438, 438, 438, 438, 438, 438, 438, 437, 437, 437, 437, 437, 437, 437, 437, 436, 436, 436, 436, 436, 436, 436, 436, -1 -// CHECK: 435, 435, 435, 435, 435, 435, 435, 435, 434, 434, 434, 434, 434, 434, 434, 434, 433, 433, 433, 433, 433, 433, 433, 433, 432, 432, 432, 432, 432, 432, 432, 432, 439, 439, 439, 439, 439, 439, 439, 439, 438, 438, 438, 438, 438, 438, 438, 438, 437, 437, 437, 437, 437, 437, 437, 437, 436, 436, 436, 436, 436, 436, 436, 436, 452, 452, 452, 452, 452, 452, 452, 452, 453, 453, 453, 453, 453, 453, 453, 453, 454, 454, 454, 454, 454, 454, 454, 454, 455, 455, 455, 455, 455, 455, 455, 455, 448, 448, 448, 448, 448, 448, 448, 448, 449, 449, 449, 449, 449, 449, 449, 449, 450, 450, 450, 450, 450, 450, 450, 450, 451, 451, 451, 451, 451, 451, 451, 451, -1 -// CHECK: 452, 452, 452, 452, 452, 452, 452, 452, 453, 453, 453, 453, 453, 453, 453, 453, 454, 454, 454, 454, 454, 454, 454, 454, 455, 455, 455, 455, 455, 455, 455, 455, 448, 448, 448, 448, 448, 448, 448, 448, 449, 449, 449, 449, 449, 449, 449, 449, 450, 450, 450, 450, 450, 450, 450, 450, 451, 451, 451, 451, 451, 451, 451, 451, 469, 469, 469, 469, 469, 469, 469, 469, 468, 468, 468, 468, 468, 468, 468, 468, 471, 471, 471, 471, 471, 471, 471, 471, 470, 470, 470, 470, 470, 470, 470, 470, 465, 465, 465, 465, 465, 465, 465, 465, 464, 464, 464, 464, 464, 464, 464, 464, 467, 467, 467, 467, 467, 467, 467, 467, 466, 466, 466, 466, 466, 466, 466, 466, -1 -// CHECK: 469, 469, 469, 469, 469, 469, 469, 469, 468, 468, 468, 468, 468, 468, 468, 468, 471, 471, 471, 471, 471, 471, 471, 471, 470, 470, 470, 470, 470, 470, 470, 470, 465, 465, 465, 465, 465, 465, 465, 465, 464, 464, 464, 464, 464, 464, 464, 464, 467, 467, 467, 467, 467, 467, 467, 467, 466, 466, 466, 466, 466, 466, 466, 466, 486, 486, 486, 486, 486, 486, 486, 486, 487, 487, 487, 487, 487, 487, 487, 487, 484, 484, 484, 484, 484, 484, 484, 484, 485, 485, 485, 485, 485, 485, 485, 485, 482, 482, 482, 482, 482, 482, 482, 482, 483, 483, 483, 483, 483, 483, 483, 483, 480, 480, 480, 480, 480, 480, 480, 480, 481, 481, 481, 481, 481, 481, 481, 481, -1 -// CHECK: 486, 486, 
486, 486, 486, 486, 486, 486, 487, 487, 487, 487, 487, 487, 487, 487, 484, 484, 484, 484, 484, 484, 484, 484, 485, 485, 485, 485, 485, 485, 485, 485, 482, 482, 482, 482, 482, 482, 482, 482, 483, 483, 483, 483, 483, 483, 483, 483, 480, 480, 480, 480, 480, 480, 480, 480, 481, 481, 481, 481, 481, 481, 481, 481, 503, 503, 503, 503, 503, 503, 503, 503, 502, 502, 502, 502, 502, 502, 502, 502, 501, 501, 501, 501, 501, 501, 501, 501, 500, 500, 500, 500, 500, 500, 500, 500, 499, 499, 499, 499, 499, 499, 499, 499, 498, 498, 498, 498, 498, 498, 498, 498, 497, 497, 497, 497, 497, 497, 497, 497, 496, 496, 496, 496, 496, 496, 496, 496, -1 -// CHECK: 503, 503, 503, 503, 503, 503, 503, 503, 502, 502, 502, 502, 502, 502, 502, 502, 501, 501, 501, 501, 501, 501, 501, 501, 500, 500, 500, 500, 500, 500, 500, 500, 499, 499, 499, 499, 499, 499, 499, 499, 498, 498, 498, 498, 498, 498, 498, 498, 497, 497, 497, 497, 497, 497, 497, 497, 496, 496, 496, 496, 496, 496, 496, 496, 512, 512, 512, 512, 512, 512, 512, 512, 513, 513, 513, 513, 513, 513, 513, 513, 514, 514, 514, 514, 514, 514, 514, 514, 515, 515, 515, 515, 515, 515, 515, 515, 516, 516, 516, 516, 516, 516, 516, 516, 517, 517, 517, 517, 517, 517, 517, 517, 518, 518, 518, 518, 518, 518, 518, 518, 519, 519, 519, 519, 519, 519, 519, 519, -1 -// CHECK: 512, 512, 512, 512, 512, 512, 512, 512, 513, 513, 513, 513, 513, 513, 513, 513, 514, 514, 514, 514, 514, 514, 514, 514, 515, 515, 515, 515, 515, 515, 515, 515, 516, 516, 516, 516, 516, 516, 516, 516, 517, 517, 517, 517, 517, 517, 517, 517, 518, 518, 518, 518, 518, 518, 518, 518, 519, 519, 519, 519, 519, 519, 519, 519, 529, 529, 529, 529, 529, 529, 529, 529, 528, 528, 528, 528, 528, 528, 528, 528, 531, 531, 531, 531, 531, 531, 531, 531, 530, 530, 530, 530, 530, 530, 530, 530, 533, 533, 533, 533, 533, 533, 533, 533, 532, 532, 532, 532, 532, 532, 532, 532, 535, 535, 535, 535, 535, 535, 535, 535, 534, 534, 534, 534, 534, 534, 534, 534, -1 -// CHECK: 529, 529, 529, 529, 529, 529, 529, 529, 528, 528, 528, 528, 528, 528, 528, 528, 531, 531, 531, 531, 531, 531, 531, 531, 530, 530, 530, 530, 530, 530, 530, 530, 533, 533, 533, 533, 533, 533, 533, 533, 532, 532, 532, 532, 532, 532, 532, 532, 535, 535, 535, 535, 535, 535, 535, 535, 534, 534, 534, 534, 534, 534, 534, 534, 546, 546, 546, 546, 546, 546, 546, 546, 547, 547, 547, 547, 547, 547, 547, 547, 544, 544, 544, 544, 544, 544, 544, 544, 545, 545, 545, 545, 545, 545, 545, 545, 550, 550, 550, 550, 550, 550, 550, 550, 551, 551, 551, 551, 551, 551, 551, 551, 548, 548, 548, 548, 548, 548, 548, 548, 549, 549, 549, 549, 549, 549, 549, 549, -1 -// CHECK: 546, 546, 546, 546, 546, 546, 546, 546, 547, 547, 547, 547, 547, 547, 547, 547, 544, 544, 544, 544, 544, 544, 544, 544, 545, 545, 545, 545, 545, 545, 545, 545, 550, 550, 550, 550, 550, 550, 550, 550, 551, 551, 551, 551, 551, 551, 551, 551, 548, 548, 548, 548, 548, 548, 548, 548, 549, 549, 549, 549, 549, 549, 549, 549, 563, 563, 563, 563, 563, 563, 563, 563, 562, 562, 562, 562, 562, 562, 562, 562, 561, 561, 561, 561, 561, 561, 561, 561, 560, 560, 560, 560, 560, 560, 560, 560, 567, 567, 567, 567, 567, 567, 567, 567, 566, 566, 566, 566, 566, 566, 566, 566, 565, 565, 565, 565, 565, 565, 565, 565, 564, 564, 564, 564, 564, 564, 564, 564, -1 -// CHECK: 563, 563, 563, 563, 563, 563, 563, 563, 562, 562, 562, 562, 562, 562, 562, 562, 561, 561, 561, 561, 561, 561, 561, 561, 560, 560, 560, 560, 560, 560, 560, 560, 567, 567, 567, 567, 567, 567, 567, 567, 566, 566, 566, 566, 566, 566, 566, 566, 565, 565, 565, 565, 565, 565, 565, 565, 564, 564, 564, 
564, 564, 564, 564, 564, 580, 580, 580, 580, 580, 580, 580, 580, 581, 581, 581, 581, 581, 581, 581, 581, 582, 582, 582, 582, 582, 582, 582, 582, 583, 583, 583, 583, 583, 583, 583, 583, 576, 576, 576, 576, 576, 576, 576, 576, 577, 577, 577, 577, 577, 577, 577, 577, 578, 578, 578, 578, 578, 578, 578, 578, 579, 579, 579, 579, 579, 579, 579, 579, -1 -// CHECK: 580, 580, 580, 580, 580, 580, 580, 580, 581, 581, 581, 581, 581, 581, 581, 581, 582, 582, 582, 582, 582, 582, 582, 582, 583, 583, 583, 583, 583, 583, 583, 583, 576, 576, 576, 576, 576, 576, 576, 576, 577, 577, 577, 577, 577, 577, 577, 577, 578, 578, 578, 578, 578, 578, 578, 578, 579, 579, 579, 579, 579, 579, 579, 579, 597, 597, 597, 597, 597, 597, 597, 597, 596, 596, 596, 596, 596, 596, 596, 596, 599, 599, 599, 599, 599, 599, 599, 599, 598, 598, 598, 598, 598, 598, 598, 598, 593, 593, 593, 593, 593, 593, 593, 593, 592, 592, 592, 592, 592, 592, 592, 592, 595, 595, 595, 595, 595, 595, 595, 595, 594, 594, 594, 594, 594, 594, 594, 594, -1 -// CHECK: 597, 597, 597, 597, 597, 597, 597, 597, 596, 596, 596, 596, 596, 596, 596, 596, 599, 599, 599, 599, 599, 599, 599, 599, 598, 598, 598, 598, 598, 598, 598, 598, 593, 593, 593, 593, 593, 593, 593, 593, 592, 592, 592, 592, 592, 592, 592, 592, 595, 595, 595, 595, 595, 595, 595, 595, 594, 594, 594, 594, 594, 594, 594, 594, 614, 614, 614, 614, 614, 614, 614, 614, 615, 615, 615, 615, 615, 615, 615, 615, 612, 612, 612, 612, 612, 612, 612, 612, 613, 613, 613, 613, 613, 613, 613, 613, 610, 610, 610, 610, 610, 610, 610, 610, 611, 611, 611, 611, 611, 611, 611, 611, 608, 608, 608, 608, 608, 608, 608, 608, 609, 609, 609, 609, 609, 609, 609, 609, -1 -// CHECK: 614, 614, 614, 614, 614, 614, 614, 614, 615, 615, 615, 615, 615, 615, 615, 615, 612, 612, 612, 612, 612, 612, 612, 612, 613, 613, 613, 613, 613, 613, 613, 613, 610, 610, 610, 610, 610, 610, 610, 610, 611, 611, 611, 611, 611, 611, 611, 611, 608, 608, 608, 608, 608, 608, 608, 608, 609, 609, 609, 609, 609, 609, 609, 609, 631, 631, 631, 631, 631, 631, 631, 631, 630, 630, 630, 630, 630, 630, 630, 630, 629, 629, 629, 629, 629, 629, 629, 629, 628, 628, 628, 628, 628, 628, 628, 628, 627, 627, 627, 627, 627, 627, 627, 627, 626, 626, 626, 626, 626, 626, 626, 626, 625, 625, 625, 625, 625, 625, 625, 625, 624, 624, 624, 624, 624, 624, 624, 624, -1 -// CHECK: 631, 631, 631, 631, 631, 631, 631, 631, 630, 630, 630, 630, 630, 630, 630, 630, 629, 629, 629, 629, 629, 629, 629, 629, 628, 628, 628, 628, 628, 628, 628, 628, 627, 627, 627, 627, 627, 627, 627, 627, 626, 626, 626, 626, 626, 626, 626, 626, 625, 625, 625, 625, 625, 625, 625, 625, 624, 624, 624, 624, 624, 624, 624, 624, 640, 640, 640, 640, 640, 640, 640, 640, 641, 641, 641, 641, 641, 641, 641, 641, 642, 642, 642, 642, 642, 642, 642, 642, 643, 643, 643, 643, 643, 643, 643, 643, 644, 644, 644, 644, 644, 644, 644, 644, 645, 645, 645, 645, 645, 645, 645, 645, 646, 646, 646, 646, 646, 646, 646, 646, 647, 647, 647, 647, 647, 647, 647, 647, -1 -// CHECK: 640, 640, 640, 640, 640, 640, 640, 640, 641, 641, 641, 641, 641, 641, 641, 641, 642, 642, 642, 642, 642, 642, 642, 642, 643, 643, 643, 643, 643, 643, 643, 643, 644, 644, 644, 644, 644, 644, 644, 644, 645, 645, 645, 645, 645, 645, 645, 645, 646, 646, 646, 646, 646, 646, 646, 646, 647, 647, 647, 647, 647, 647, 647, 647, 657, 657, 657, 657, 657, 657, 657, 657, 656, 656, 656, 656, 656, 656, 656, 656, 659, 659, 659, 659, 659, 659, 659, 659, 658, 658, 658, 658, 658, 658, 658, 658, 661, 661, 661, 661, 661, 661, 661, 661, 660, 660, 660, 660, 660, 660, 660, 660, 663, 663, 663, 663, 
663, 663, 663, 663, 662, 662, 662, 662, 662, 662, 662, 662, -1 -// CHECK: 657, 657, 657, 657, 657, 657, 657, 657, 656, 656, 656, 656, 656, 656, 656, 656, 659, 659, 659, 659, 659, 659, 659, 659, 658, 658, 658, 658, 658, 658, 658, 658, 661, 661, 661, 661, 661, 661, 661, 661, 660, 660, 660, 660, 660, 660, 660, 660, 663, 663, 663, 663, 663, 663, 663, 663, 662, 662, 662, 662, 662, 662, 662, 662, 674, 674, 674, 674, 674, 674, 674, 674, 675, 675, 675, 675, 675, 675, 675, 675, 672, 672, 672, 672, 672, 672, 672, 672, 673, 673, 673, 673, 673, 673, 673, 673, 678, 678, 678, 678, 678, 678, 678, 678, 679, 679, 679, 679, 679, 679, 679, 679, 676, 676, 676, 676, 676, 676, 676, 676, 677, 677, 677, 677, 677, 677, 677, 677, -1 -// CHECK: 674, 674, 674, 674, 674, 674, 674, 674, 675, 675, 675, 675, 675, 675, 675, 675, 672, 672, 672, 672, 672, 672, 672, 672, 673, 673, 673, 673, 673, 673, 673, 673, 678, 678, 678, 678, 678, 678, 678, 678, 679, 679, 679, 679, 679, 679, 679, 679, 676, 676, 676, 676, 676, 676, 676, 676, 677, 677, 677, 677, 677, 677, 677, 677, 691, 691, 691, 691, 691, 691, 691, 691, 690, 690, 690, 690, 690, 690, 690, 690, 689, 689, 689, 689, 689, 689, 689, 689, 688, 688, 688, 688, 688, 688, 688, 688, 695, 695, 695, 695, 695, 695, 695, 695, 694, 694, 694, 694, 694, 694, 694, 694, 693, 693, 693, 693, 693, 693, 693, 693, 692, 692, 692, 692, 692, 692, 692, 692, -1 -// CHECK: 691, 691, 691, 691, 691, 691, 691, 691, 690, 690, 690, 690, 690, 690, 690, 690, 689, 689, 689, 689, 689, 689, 689, 689, 688, 688, 688, 688, 688, 688, 688, 688, 695, 695, 695, 695, 695, 695, 695, 695, 694, 694, 694, 694, 694, 694, 694, 694, 693, 693, 693, 693, 693, 693, 693, 693, 692, 692, 692, 692, 692, 692, 692, 692, 708, 708, 708, 708, 708, 708, 708, 708, 709, 709, 709, 709, 709, 709, 709, 709, 710, 710, 710, 710, 710, 710, 710, 710, 711, 711, 711, 711, 711, 711, 711, 711, 704, 704, 704, 704, 704, 704, 704, 704, 705, 705, 705, 705, 705, 705, 705, 705, 706, 706, 706, 706, 706, 706, 706, 706, 707, 707, 707, 707, 707, 707, 707, 707, -1 -// CHECK: 708, 708, 708, 708, 708, 708, 708, 708, 709, 709, 709, 709, 709, 709, 709, 709, 710, 710, 710, 710, 710, 710, 710, 710, 711, 711, 711, 711, 711, 711, 711, 711, 704, 704, 704, 704, 704, 704, 704, 704, 705, 705, 705, 705, 705, 705, 705, 705, 706, 706, 706, 706, 706, 706, 706, 706, 707, 707, 707, 707, 707, 707, 707, 707, 725, 725, 725, 725, 725, 725, 725, 725, 724, 724, 724, 724, 724, 724, 724, 724, 727, 727, 727, 727, 727, 727, 727, 727, 726, 726, 726, 726, 726, 726, 726, 726, 721, 721, 721, 721, 721, 721, 721, 721, 720, 720, 720, 720, 720, 720, 720, 720, 723, 723, 723, 723, 723, 723, 723, 723, 722, 722, 722, 722, 722, 722, 722, 722, -1 -// CHECK: 725, 725, 725, 725, 725, 725, 725, 725, 724, 724, 724, 724, 724, 724, 724, 724, 727, 727, 727, 727, 727, 727, 727, 727, 726, 726, 726, 726, 726, 726, 726, 726, 721, 721, 721, 721, 721, 721, 721, 721, 720, 720, 720, 720, 720, 720, 720, 720, 723, 723, 723, 723, 723, 723, 723, 723, 722, 722, 722, 722, 722, 722, 722, 722, 742, 742, 742, 742, 742, 742, 742, 742, 743, 743, 743, 743, 743, 743, 743, 743, 740, 740, 740, 740, 740, 740, 740, 740, 741, 741, 741, 741, 741, 741, 741, 741, 738, 738, 738, 738, 738, 738, 738, 738, 739, 739, 739, 739, 739, 739, 739, 739, 736, 736, 736, 736, 736, 736, 736, 736, 737, 737, 737, 737, 737, 737, 737, 737, -1 -// CHECK: 742, 742, 742, 742, 742, 742, 742, 742, 743, 743, 743, 743, 743, 743, 743, 743, 740, 740, 740, 740, 740, 740, 740, 740, 741, 741, 741, 741, 741, 741, 741, 741, 738, 738, 738, 738, 738, 738, 738, 738, 739, 739, 
739, 739, 739, 739, 739, 739, 736, 736, 736, 736, 736, 736, 736, 736, 737, 737, 737, 737, 737, 737, 737, 737, 759, 759, 759, 759, 759, 759, 759, 759, 758, 758, 758, 758, 758, 758, 758, 758, 757, 757, 757, 757, 757, 757, 757, 757, 756, 756, 756, 756, 756, 756, 756, 756, 755, 755, 755, 755, 755, 755, 755, 755, 754, 754, 754, 754, 754, 754, 754, 754, 753, 753, 753, 753, 753, 753, 753, 753, 752, 752, 752, 752, 752, 752, 752, 752, -1 -// CHECK: 759, 759, 759, 759, 759, 759, 759, 759, 758, 758, 758, 758, 758, 758, 758, 758, 757, 757, 757, 757, 757, 757, 757, 757, 756, 756, 756, 756, 756, 756, 756, 756, 755, 755, 755, 755, 755, 755, 755, 755, 754, 754, 754, 754, 754, 754, 754, 754, 753, 753, 753, 753, 753, 753, 753, 753, 752, 752, 752, 752, 752, 752, 752, 752, 768, 768, 768, 768, 768, 768, 768, 768, 769, 769, 769, 769, 769, 769, 769, 769, 770, 770, 770, 770, 770, 770, 770, 770, 771, 771, 771, 771, 771, 771, 771, 771, 772, 772, 772, 772, 772, 772, 772, 772, 773, 773, 773, 773, 773, 773, 773, 773, 774, 774, 774, 774, 774, 774, 774, 774, 775, 775, 775, 775, 775, 775, 775, 775, -1 -// CHECK: 768, 768, 768, 768, 768, 768, 768, 768, 769, 769, 769, 769, 769, 769, 769, 769, 770, 770, 770, 770, 770, 770, 770, 770, 771, 771, 771, 771, 771, 771, 771, 771, 772, 772, 772, 772, 772, 772, 772, 772, 773, 773, 773, 773, 773, 773, 773, 773, 774, 774, 774, 774, 774, 774, 774, 774, 775, 775, 775, 775, 775, 775, 775, 775, 785, 785, 785, 785, 785, 785, 785, 785, 784, 784, 784, 784, 784, 784, 784, 784, 787, 787, 787, 787, 787, 787, 787, 787, 786, 786, 786, 786, 786, 786, 786, 786, 789, 789, 789, 789, 789, 789, 789, 789, 788, 788, 788, 788, 788, 788, 788, 788, 791, 791, 791, 791, 791, 791, 791, 791, 790, 790, 790, 790, 790, 790, 790, 790, -1 -// CHECK: 785, 785, 785, 785, 785, 785, 785, 785, 784, 784, 784, 784, 784, 784, 784, 784, 787, 787, 787, 787, 787, 787, 787, 787, 786, 786, 786, 786, 786, 786, 786, 786, 789, 789, 789, 789, 789, 789, 789, 789, 788, 788, 788, 788, 788, 788, 788, 788, 791, 791, 791, 791, 791, 791, 791, 791, 790, 790, 790, 790, 790, 790, 790, 790, 802, 802, 802, 802, 802, 802, 802, 802, 803, 803, 803, 803, 803, 803, 803, 803, 800, 800, 800, 800, 800, 800, 800, 800, 801, 801, 801, 801, 801, 801, 801, 801, 806, 806, 806, 806, 806, 806, 806, 806, 807, 807, 807, 807, 807, 807, 807, 807, 804, 804, 804, 804, 804, 804, 804, 804, 805, 805, 805, 805, 805, 805, 805, 805, -1 -// CHECK: 802, 802, 802, 802, 802, 802, 802, 802, 803, 803, 803, 803, 803, 803, 803, 803, 800, 800, 800, 800, 800, 800, 800, 800, 801, 801, 801, 801, 801, 801, 801, 801, 806, 806, 806, 806, 806, 806, 806, 806, 807, 807, 807, 807, 807, 807, 807, 807, 804, 804, 804, 804, 804, 804, 804, 804, 805, 805, 805, 805, 805, 805, 805, 805, 819, 819, 819, 819, 819, 819, 819, 819, 818, 818, 818, 818, 818, 818, 818, 818, 817, 817, 817, 817, 817, 817, 817, 817, 816, 816, 816, 816, 816, 816, 816, 816, 823, 823, 823, 823, 823, 823, 823, 823, 822, 822, 822, 822, 822, 822, 822, 822, 821, 821, 821, 821, 821, 821, 821, 821, 820, 820, 820, 820, 820, 820, 820, 820, -1 -// CHECK: 819, 819, 819, 819, 819, 819, 819, 819, 818, 818, 818, 818, 818, 818, 818, 818, 817, 817, 817, 817, 817, 817, 817, 817, 816, 816, 816, 816, 816, 816, 816, 816, 823, 823, 823, 823, 823, 823, 823, 823, 822, 822, 822, 822, 822, 822, 822, 822, 821, 821, 821, 821, 821, 821, 821, 821, 820, 820, 820, 820, 820, 820, 820, 820, 836, 836, 836, 836, 836, 836, 836, 836, 837, 837, 837, 837, 837, 837, 837, 837, 838, 838, 838, 838, 838, 838, 838, 838, 839, 839, 839, 839, 839, 839, 839, 839, 832, 832, 832, 
832, 832, 832, 832, 832, 833, 833, 833, 833, 833, 833, 833, 833, 834, 834, 834, 834, 834, 834, 834, 834, 835, 835, 835, 835, 835, 835, 835, 835, -1 -// CHECK: 836, 836, 836, 836, 836, 836, 836, 836, 837, 837, 837, 837, 837, 837, 837, 837, 838, 838, 838, 838, 838, 838, 838, 838, 839, 839, 839, 839, 839, 839, 839, 839, 832, 832, 832, 832, 832, 832, 832, 832, 833, 833, 833, 833, 833, 833, 833, 833, 834, 834, 834, 834, 834, 834, 834, 834, 835, 835, 835, 835, 835, 835, 835, 835, 853, 853, 853, 853, 853, 853, 853, 853, 852, 852, 852, 852, 852, 852, 852, 852, 855, 855, 855, 855, 855, 855, 855, 855, 854, 854, 854, 854, 854, 854, 854, 854, 849, 849, 849, 849, 849, 849, 849, 849, 848, 848, 848, 848, 848, 848, 848, 848, 851, 851, 851, 851, 851, 851, 851, 851, 850, 850, 850, 850, 850, 850, 850, 850, -1 -// CHECK: 853, 853, 853, 853, 853, 853, 853, 853, 852, 852, 852, 852, 852, 852, 852, 852, 855, 855, 855, 855, 855, 855, 855, 855, 854, 854, 854, 854, 854, 854, 854, 854, 849, 849, 849, 849, 849, 849, 849, 849, 848, 848, 848, 848, 848, 848, 848, 848, 851, 851, 851, 851, 851, 851, 851, 851, 850, 850, 850, 850, 850, 850, 850, 850, 870, 870, 870, 870, 870, 870, 870, 870, 871, 871, 871, 871, 871, 871, 871, 871, 868, 868, 868, 868, 868, 868, 868, 868, 869, 869, 869, 869, 869, 869, 869, 869, 866, 866, 866, 866, 866, 866, 866, 866, 867, 867, 867, 867, 867, 867, 867, 867, 864, 864, 864, 864, 864, 864, 864, 864, 865, 865, 865, 865, 865, 865, 865, 865, -1 -// CHECK: 870, 870, 870, 870, 870, 870, 870, 870, 871, 871, 871, 871, 871, 871, 871, 871, 868, 868, 868, 868, 868, 868, 868, 868, 869, 869, 869, 869, 869, 869, 869, 869, 866, 866, 866, 866, 866, 866, 866, 866, 867, 867, 867, 867, 867, 867, 867, 867, 864, 864, 864, 864, 864, 864, 864, 864, 865, 865, 865, 865, 865, 865, 865, 865, 887, 887, 887, 887, 887, 887, 887, 887, 886, 886, 886, 886, 886, 886, 886, 886, 885, 885, 885, 885, 885, 885, 885, 885, 884, 884, 884, 884, 884, 884, 884, 884, 883, 883, 883, 883, 883, 883, 883, 883, 882, 882, 882, 882, 882, 882, 882, 882, 881, 881, 881, 881, 881, 881, 881, 881, 880, 880, 880, 880, 880, 880, 880, 880, -1 -// CHECK: 887, 887, 887, 887, 887, 887, 887, 887, 886, 886, 886, 886, 886, 886, 886, 886, 885, 885, 885, 885, 885, 885, 885, 885, 884, 884, 884, 884, 884, 884, 884, 884, 883, 883, 883, 883, 883, 883, 883, 883, 882, 882, 882, 882, 882, 882, 882, 882, 881, 881, 881, 881, 881, 881, 881, 881, 880, 880, 880, 880, 880, 880, 880, 880, 896, 896, 896, 896, 896, 896, 896, 896, 897, 897, 897, 897, 897, 897, 897, 897, 898, 898, 898, 898, 898, 898, 898, 898, 899, 899, 899, 899, 899, 899, 899, 899, 900, 900, 900, 900, 900, 900, 900, 900, 901, 901, 901, 901, 901, 901, 901, 901, 902, 902, 902, 902, 902, 902, 902, 902, 903, 903, 903, 903, 903, 903, 903, 903, -1 -// CHECK: 896, 896, 896, 896, 896, 896, 896, 896, 897, 897, 897, 897, 897, 897, 897, 897, 898, 898, 898, 898, 898, 898, 898, 898, 899, 899, 899, 899, 899, 899, 899, 899, 900, 900, 900, 900, 900, 900, 900, 900, 901, 901, 901, 901, 901, 901, 901, 901, 902, 902, 902, 902, 902, 902, 902, 902, 903, 903, 903, 903, 903, 903, 903, 903, 913, 913, 913, 913, 913, 913, 913, 913, 912, 912, 912, 912, 912, 912, 912, 912, 915, 915, 915, 915, 915, 915, 915, 915, 914, 914, 914, 914, 914, 914, 914, 914, 917, 917, 917, 917, 917, 917, 917, 917, 916, 916, 916, 916, 916, 916, 916, 916, 919, 919, 919, 919, 919, 919, 919, 919, 918, 918, 918, 918, 918, 918, 918, 918, -1 -// CHECK: 913, 913, 913, 913, 913, 913, 913, 913, 912, 912, 912, 912, 912, 912, 912, 912, 915, 915, 915, 915, 915, 915, 915, 915, 914, 
914, 914, 914, 914, 914, 914, 914, 917, 917, 917, 917, 917, 917, 917, 917, 916, 916, 916, 916, 916, 916, 916, 916, 919, 919, 919, 919, 919, 919, 919, 919, 918, 918, 918, 918, 918, 918, 918, 918, 930, 930, 930, 930, 930, 930, 930, 930, 931, 931, 931, 931, 931, 931, 931, 931, 928, 928, 928, 928, 928, 928, 928, 928, 929, 929, 929, 929, 929, 929, 929, 929, 934, 934, 934, 934, 934, 934, 934, 934, 935, 935, 935, 935, 935, 935, 935, 935, 932, 932, 932, 932, 932, 932, 932, 932, 933, 933, 933, 933, 933, 933, 933, 933, -1 -// CHECK: 930, 930, 930, 930, 930, 930, 930, 930, 931, 931, 931, 931, 931, 931, 931, 931, 928, 928, 928, 928, 928, 928, 928, 928, 929, 929, 929, 929, 929, 929, 929, 929, 934, 934, 934, 934, 934, 934, 934, 934, 935, 935, 935, 935, 935, 935, 935, 935, 932, 932, 932, 932, 932, 932, 932, 932, 933, 933, 933, 933, 933, 933, 933, 933, 947, 947, 947, 947, 947, 947, 947, 947, 946, 946, 946, 946, 946, 946, 946, 946, 945, 945, 945, 945, 945, 945, 945, 945, 944, 944, 944, 944, 944, 944, 944, 944, 951, 951, 951, 951, 951, 951, 951, 951, 950, 950, 950, 950, 950, 950, 950, 950, 949, 949, 949, 949, 949, 949, 949, 949, 948, 948, 948, 948, 948, 948, 948, 948, -1 -// CHECK: 947, 947, 947, 947, 947, 947, 947, 947, 946, 946, 946, 946, 946, 946, 946, 946, 945, 945, 945, 945, 945, 945, 945, 945, 944, 944, 944, 944, 944, 944, 944, 944, 951, 951, 951, 951, 951, 951, 951, 951, 950, 950, 950, 950, 950, 950, 950, 950, 949, 949, 949, 949, 949, 949, 949, 949, 948, 948, 948, 948, 948, 948, 948, 948, 964, 964, 964, 964, 964, 964, 964, 964, 965, 965, 965, 965, 965, 965, 965, 965, 966, 966, 966, 966, 966, 966, 966, 966, 967, 967, 967, 967, 967, 967, 967, 967, 960, 960, 960, 960, 960, 960, 960, 960, 961, 961, 961, 961, 961, 961, 961, 961, 962, 962, 962, 962, 962, 962, 962, 962, 963, 963, 963, 963, 963, 963, 963, 963, -1 -// CHECK: 964, 964, 964, 964, 964, 964, 964, 964, 965, 965, 965, 965, 965, 965, 965, 965, 966, 966, 966, 966, 966, 966, 966, 966, 967, 967, 967, 967, 967, 967, 967, 967, 960, 960, 960, 960, 960, 960, 960, 960, 961, 961, 961, 961, 961, 961, 961, 961, 962, 962, 962, 962, 962, 962, 962, 962, 963, 963, 963, 963, 963, 963, 963, 963, 981, 981, 981, 981, 981, 981, 981, 981, 980, 980, 980, 980, 980, 980, 980, 980, 983, 983, 983, 983, 983, 983, 983, 983, 982, 982, 982, 982, 982, 982, 982, 982, 977, 977, 977, 977, 977, 977, 977, 977, 976, 976, 976, 976, 976, 976, 976, 976, 979, 979, 979, 979, 979, 979, 979, 979, 978, 978, 978, 978, 978, 978, 978, 978, -1 -// CHECK: 981, 981, 981, 981, 981, 981, 981, 981, 980, 980, 980, 980, 980, 980, 980, 980, 983, 983, 983, 983, 983, 983, 983, 983, 982, 982, 982, 982, 982, 982, 982, 982, 977, 977, 977, 977, 977, 977, 977, 977, 976, 976, 976, 976, 976, 976, 976, 976, 979, 979, 979, 979, 979, 979, 979, 979, 978, 978, 978, 978, 978, 978, 978, 978, 998, 998, 998, 998, 998, 998, 998, 998, 999, 999, 999, 999, 999, 999, 999, 999, 996, 996, 996, 996, 996, 996, 996, 996, 997, 997, 997, 997, 997, 997, 997, 997, 994, 994, 994, 994, 994, 994, 994, 994, 995, 995, 995, 995, 995, 995, 995, 995, 992, 992, 992, 992, 992, 992, 992, 992, 993, 993, 993, 993, 993, 993, 993, 993, -1 -// CHECK: 998, 998, 998, 998, 998, 998, 998, 998, 999, 999, 999, 999, 999, 999, 999, 999, 996, 996, 996, 996, 996, 996, 996, 996, 997, 997, 997, 997, 997, 997, 997, 997, 994, 994, 994, 994, 994, 994, 994, 994, 995, 995, 995, 995, 995, 995, 995, 995, 992, 992, 992, 992, 992, 992, 992, 992, 993, 993, 993, 993, 993, 993, 993, 993, 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1014, 1014, 1014, 1014, 1014, 1014, 1014, 
1014, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1008, 1008, 1008, 1008, 1008, 1008, 1008, 1008, -1 -// CHECK: 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1014, 1014, 1014, 1014, 1014, 1014, 1014, 1014, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1008, 1008, 1008, 1008, 1008, 1008, 1008, 1008, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 +// CHECK: ===--- Matrix B ---=== +// CHECK: 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 17, 17, 17, 17, 17, 17, 17, 17, 16, 16, 16, 16, 16, 16, 16, 16, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18, 18, 18, 18, 18, 18, 18, 21, 21, 21, 21, 21, 21, 21, 21, 20, 20, 20, 20, 20, 20, 20, 20, 23, 23, 23, 23, 23, 23, 23, 23, 22, 22, 22, 22, 22, 22, 22, 22 +// CHECK: 34, 34, 34, 34, 34, 34, 34, 34, 35, 35, 35, 35, 35, 35, 35, 35, 32, 32, 32, 32, 32, 32, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, 36, 36, 36, 36, 36, 36, 36, 36, 37, 37, 37, 37, 37, 37, 37, 37, 51, 51, 51, 51, 51, 51, 51, 51, 50, 50, 50, 50, 50, 50, 50, 50, 49, 49, 49, 49, 49, 49, 49, 49, 48, 48, 48, 48, 48, 48, 48, 48, 55, 55, 55, 55, 55, 55, 55, 55, 54, 54, 54, 54, 54, 54, 54, 54, 53, 53, 53, 53, 53, 53, 53, 53, 52, 52, 52, 52, 52, 52, 52, 52 +// CHECK: 68, 68, 68, 68, 68, 68, 68, 68, 69, 69, 69, 69, 69, 69, 69, 69, 70, 70, 70, 70, 70, 70, 70, 70, 71, 71, 71, 71, 71, 71, 71, 71, 64, 64, 64, 64, 64, 64, 64, 64, 65, 65, 65, 65, 65, 65, 65, 65, 66, 66, 66, 66, 66, 66, 66, 66, 67, 67, 67, 67, 67, 67, 67, 67, 85, 85, 85, 85, 85, 85, 85, 85, 84, 84, 84, 84, 84, 84, 84, 84, 87, 87, 87, 87, 87, 87, 87, 87, 86, 86, 86, 86, 86, 86, 86, 86, 81, 81, 81, 81, 81, 81, 81, 81, 80, 80, 80, 80, 80, 80, 80, 80, 83, 83, 83, 83, 83, 83, 83, 83, 82, 82, 82, 82, 82, 82, 82, 82 +// CHECK: 102, 102, 102, 102, 102, 102, 102, 102, 103, 103, 103, 103, 103, 103, 103, 103, 100, 100, 100, 100, 100, 100, 100, 100, 101, 101, 101, 101, 101, 101, 101, 101, 98, 98, 98, 98, 98, 98, 98, 98, 99, 99, 99, 99, 99, 99, 99, 99, 96, 96, 96, 96, 96, 96, 96, 96, 97, 97, 97, 97, 97, 97, 97, 97, 119, 119, 119, 119, 119, 119, 119, 119, 118, 118, 118, 118, 118, 118, 118, 118, 117, 117, 117, 117, 117, 117, 117, 117, 116, 116, 116, 116, 116, 116, 116, 116, 115, 115, 115, 115, 115, 115, 115, 115, 114, 114, 114, 114, 114, 114, 114, 114, 113, 113, 113, 113, 113, 113, 113, 113, 112, 112, 112, 112, 112, 112, 112, 112 +// CHECK: 128, 128, 128, 128, 128, 128, 128, 128, 129, 129, 129, 129, 129, 129, 129, 129, 130, 130, 130, 130, 130, 130, 130, 130, 131, 131, 131, 131, 131, 131, 131, 131, 132, 132, 132, 132, 132, 132, 132, 132, 133, 133, 133, 133, 133, 133, 133, 133, 134, 134, 134, 134, 134, 134, 134, 134, 135, 135, 135, 135, 135, 135, 135, 135, 145, 145, 145, 145, 145, 145, 145, 145, 144, 144, 144, 144, 144, 144, 144, 144, 147, 147, 147, 147, 147, 147, 147, 147, 146, 146, 146, 146, 146, 146, 146, 146, 149, 149, 149, 149, 
149, 149, 149, 149, 148, 148, 148, 148, 148, 148, 148, 148, 151, 151, 151, 151, 151, 151, 151, 151, 150, 150, 150, 150, 150, 150, 150, 150 +// CHECK: 162, 162, 162, 162, 162, 162, 162, 162, 163, 163, 163, 163, 163, 163, 163, 163, 160, 160, 160, 160, 160, 160, 160, 160, 161, 161, 161, 161, 161, 161, 161, 161, 166, 166, 166, 166, 166, 166, 166, 166, 167, 167, 167, 167, 167, 167, 167, 167, 164, 164, 164, 164, 164, 164, 164, 164, 165, 165, 165, 165, 165, 165, 165, 165, 179, 179, 179, 179, 179, 179, 179, 179, 178, 178, 178, 178, 178, 178, 178, 178, 177, 177, 177, 177, 177, 177, 177, 177, 176, 176, 176, 176, 176, 176, 176, 176, 183, 183, 183, 183, 183, 183, 183, 183, 182, 182, 182, 182, 182, 182, 182, 182, 181, 181, 181, 181, 181, 181, 181, 181, 180, 180, 180, 180, 180, 180, 180, 180 +// CHECK: 196, 196, 196, 196, 196, 196, 196, 196, 197, 197, 197, 197, 197, 197, 197, 197, 198, 198, 198, 198, 198, 198, 198, 198, 199, 199, 199, 199, 199, 199, 199, 199, 192, 192, 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, 193, 193, 193, 193, 194, 194, 194, 194, 194, 194, 194, 194, 195, 195, 195, 195, 195, 195, 195, 195, 213, 213, 213, 213, 213, 213, 213, 213, 212, 212, 212, 212, 212, 212, 212, 212, 215, 215, 215, 215, 215, 215, 215, 215, 214, 214, 214, 214, 214, 214, 214, 214, 209, 209, 209, 209, 209, 209, 209, 209, 208, 208, 208, 208, 208, 208, 208, 208, 211, 211, 211, 211, 211, 211, 211, 211, 210, 210, 210, 210, 210, 210, 210, 210 +// CHECK: 230, 230, 230, 230, 230, 230, 230, 230, 231, 231, 231, 231, 231, 231, 231, 231, 228, 228, 228, 228, 228, 228, 228, 228, 229, 229, 229, 229, 229, 229, 229, 229, 226, 226, 226, 226, 226, 226, 226, 226, 227, 227, 227, 227, 227, 227, 227, 227, 224, 224, 224, 224, 224, 224, 224, 224, 225, 225, 225, 225, 225, 225, 225, 225, 247, 247, 247, 247, 247, 247, 247, 247, 246, 246, 246, 246, 246, 246, 246, 246, 245, 245, 245, 245, 245, 245, 245, 245, 244, 244, 244, 244, 244, 244, 244, 244, 243, 243, 243, 243, 243, 243, 243, 243, 242, 242, 242, 242, 242, 242, 242, 242, 241, 241, 241, 241, 241, 241, 241, 241, 240, 240, 240, 240, 240, 240, 240, 240 +// CHECK: 256, 256, 256, 256, 256, 256, 256, 256, 257, 257, 257, 257, 257, 257, 257, 257, 258, 258, 258, 258, 258, 258, 258, 258, 259, 259, 259, 259, 259, 259, 259, 259, 260, 260, 260, 260, 260, 260, 260, 260, 261, 261, 261, 261, 261, 261, 261, 261, 262, 262, 262, 262, 262, 262, 262, 262, 263, 263, 263, 263, 263, 263, 263, 263, 273, 273, 273, 273, 273, 273, 273, 273, 272, 272, 272, 272, 272, 272, 272, 272, 275, 275, 275, 275, 275, 275, 275, 275, 274, 274, 274, 274, 274, 274, 274, 274, 277, 277, 277, 277, 277, 277, 277, 277, 276, 276, 276, 276, 276, 276, 276, 276, 279, 279, 279, 279, 279, 279, 279, 279, 278, 278, 278, 278, 278, 278, 278, 278 +// CHECK: 290, 290, 290, 290, 290, 290, 290, 290, 291, 291, 291, 291, 291, 291, 291, 291, 288, 288, 288, 288, 288, 288, 288, 288, 289, 289, 289, 289, 289, 289, 289, 289, 294, 294, 294, 294, 294, 294, 294, 294, 295, 295, 295, 295, 295, 295, 295, 295, 292, 292, 292, 292, 292, 292, 292, 292, 293, 293, 293, 293, 293, 293, 293, 293, 307, 307, 307, 307, 307, 307, 307, 307, 306, 306, 306, 306, 306, 306, 306, 306, 305, 305, 305, 305, 305, 305, 305, 305, 304, 304, 304, 304, 304, 304, 304, 304, 311, 311, 311, 311, 311, 311, 311, 311, 310, 310, 310, 310, 310, 310, 310, 310, 309, 309, 309, 309, 309, 309, 309, 309, 308, 308, 308, 308, 308, 308, 308, 308 +// CHECK: 324, 324, 324, 324, 324, 324, 324, 324, 325, 325, 325, 325, 325, 325, 325, 325, 326, 326, 326, 326, 326, 326, 326, 326, 327, 327, 327, 327, 327, 327, 327, 
327, 320, 320, 320, 320, 320, 320, 320, 320, 321, 321, 321, 321, 321, 321, 321, 321, 322, 322, 322, 322, 322, 322, 322, 322, 323, 323, 323, 323, 323, 323, 323, 323, 341, 341, 341, 341, 341, 341, 341, 341, 340, 340, 340, 340, 340, 340, 340, 340, 343, 343, 343, 343, 343, 343, 343, 343, 342, 342, 342, 342, 342, 342, 342, 342, 337, 337, 337, 337, 337, 337, 337, 337, 336, 336, 336, 336, 336, 336, 336, 336, 339, 339, 339, 339, 339, 339, 339, 339, 338, 338, 338, 338, 338, 338, 338, 338 +// CHECK: 358, 358, 358, 358, 358, 358, 358, 358, 359, 359, 359, 359, 359, 359, 359, 359, 356, 356, 356, 356, 356, 356, 356, 356, 357, 357, 357, 357, 357, 357, 357, 357, 354, 354, 354, 354, 354, 354, 354, 354, 355, 355, 355, 355, 355, 355, 355, 355, 352, 352, 352, 352, 352, 352, 352, 352, 353, 353, 353, 353, 353, 353, 353, 353, 375, 375, 375, 375, 375, 375, 375, 375, 374, 374, 374, 374, 374, 374, 374, 374, 373, 373, 373, 373, 373, 373, 373, 373, 372, 372, 372, 372, 372, 372, 372, 372, 371, 371, 371, 371, 371, 371, 371, 371, 370, 370, 370, 370, 370, 370, 370, 370, 369, 369, 369, 369, 369, 369, 369, 369, 368, 368, 368, 368, 368, 368, 368, 368 +// CHECK: 384, 384, 384, 384, 384, 384, 384, 384, 385, 385, 385, 385, 385, 385, 385, 385, 386, 386, 386, 386, 386, 386, 386, 386, 387, 387, 387, 387, 387, 387, 387, 387, 388, 388, 388, 388, 388, 388, 388, 388, 389, 389, 389, 389, 389, 389, 389, 389, 390, 390, 390, 390, 390, 390, 390, 390, 391, 391, 391, 391, 391, 391, 391, 391, 401, 401, 401, 401, 401, 401, 401, 401, 400, 400, 400, 400, 400, 400, 400, 400, 403, 403, 403, 403, 403, 403, 403, 403, 402, 402, 402, 402, 402, 402, 402, 402, 405, 405, 405, 405, 405, 405, 405, 405, 404, 404, 404, 404, 404, 404, 404, 404, 407, 407, 407, 407, 407, 407, 407, 407, 406, 406, 406, 406, 406, 406, 406, 406 +// CHECK: 418, 418, 418, 418, 418, 418, 418, 418, 419, 419, 419, 419, 419, 419, 419, 419, 416, 416, 416, 416, 416, 416, 416, 416, 417, 417, 417, 417, 417, 417, 417, 417, 422, 422, 422, 422, 422, 422, 422, 422, 423, 423, 423, 423, 423, 423, 423, 423, 420, 420, 420, 420, 420, 420, 420, 420, 421, 421, 421, 421, 421, 421, 421, 421, 435, 435, 435, 435, 435, 435, 435, 435, 434, 434, 434, 434, 434, 434, 434, 434, 433, 433, 433, 433, 433, 433, 433, 433, 432, 432, 432, 432, 432, 432, 432, 432, 439, 439, 439, 439, 439, 439, 439, 439, 438, 438, 438, 438, 438, 438, 438, 438, 437, 437, 437, 437, 437, 437, 437, 437, 436, 436, 436, 436, 436, 436, 436, 436 +// CHECK: 452, 452, 452, 452, 452, 452, 452, 452, 453, 453, 453, 453, 453, 453, 453, 453, 454, 454, 454, 454, 454, 454, 454, 454, 455, 455, 455, 455, 455, 455, 455, 455, 448, 448, 448, 448, 448, 448, 448, 448, 449, 449, 449, 449, 449, 449, 449, 449, 450, 450, 450, 450, 450, 450, 450, 450, 451, 451, 451, 451, 451, 451, 451, 451, 469, 469, 469, 469, 469, 469, 469, 469, 468, 468, 468, 468, 468, 468, 468, 468, 471, 471, 471, 471, 471, 471, 471, 471, 470, 470, 470, 470, 470, 470, 470, 470, 465, 465, 465, 465, 465, 465, 465, 465, 464, 464, 464, 464, 464, 464, 464, 464, 467, 467, 467, 467, 467, 467, 467, 467, 466, 466, 466, 466, 466, 466, 466, 466 +// CHECK: 486, 486, 486, 486, 486, 486, 486, 486, 487, 487, 487, 487, 487, 487, 487, 487, 484, 484, 484, 484, 484, 484, 484, 484, 485, 485, 485, 485, 485, 485, 485, 485, 482, 482, 482, 482, 482, 482, 482, 482, 483, 483, 483, 483, 483, 483, 483, 483, 480, 480, 480, 480, 480, 480, 480, 480, 481, 481, 481, 481, 481, 481, 481, 481, 503, 503, 503, 503, 503, 503, 503, 503, 502, 502, 502, 502, 502, 502, 502, 502, 501, 501, 501, 501, 501, 501, 501, 501, 500, 500, 500, 500, 
500, 500, 500, 500, 499, 499, 499, 499, 499, 499, 499, 499, 498, 498, 498, 498, 498, 498, 498, 498, 497, 497, 497, 497, 497, 497, 497, 497, 496, 496, 496, 496, 496, 496, 496, 496 +// CHECK: 512, 512, 512, 512, 512, 512, 512, 512, 513, 513, 513, 513, 513, 513, 513, 513, 514, 514, 514, 514, 514, 514, 514, 514, 515, 515, 515, 515, 515, 515, 515, 515, 516, 516, 516, 516, 516, 516, 516, 516, 517, 517, 517, 517, 517, 517, 517, 517, 518, 518, 518, 518, 518, 518, 518, 518, 519, 519, 519, 519, 519, 519, 519, 519, 529, 529, 529, 529, 529, 529, 529, 529, 528, 528, 528, 528, 528, 528, 528, 528, 531, 531, 531, 531, 531, 531, 531, 531, 530, 530, 530, 530, 530, 530, 530, 530, 533, 533, 533, 533, 533, 533, 533, 533, 532, 532, 532, 532, 532, 532, 532, 532, 535, 535, 535, 535, 535, 535, 535, 535, 534, 534, 534, 534, 534, 534, 534, 534 +// CHECK: 546, 546, 546, 546, 546, 546, 546, 546, 547, 547, 547, 547, 547, 547, 547, 547, 544, 544, 544, 544, 544, 544, 544, 544, 545, 545, 545, 545, 545, 545, 545, 545, 550, 550, 550, 550, 550, 550, 550, 550, 551, 551, 551, 551, 551, 551, 551, 551, 548, 548, 548, 548, 548, 548, 548, 548, 549, 549, 549, 549, 549, 549, 549, 549, 563, 563, 563, 563, 563, 563, 563, 563, 562, 562, 562, 562, 562, 562, 562, 562, 561, 561, 561, 561, 561, 561, 561, 561, 560, 560, 560, 560, 560, 560, 560, 560, 567, 567, 567, 567, 567, 567, 567, 567, 566, 566, 566, 566, 566, 566, 566, 566, 565, 565, 565, 565, 565, 565, 565, 565, 564, 564, 564, 564, 564, 564, 564, 564 +// CHECK: 580, 580, 580, 580, 580, 580, 580, 580, 581, 581, 581, 581, 581, 581, 581, 581, 582, 582, 582, 582, 582, 582, 582, 582, 583, 583, 583, 583, 583, 583, 583, 583, 576, 576, 576, 576, 576, 576, 576, 576, 577, 577, 577, 577, 577, 577, 577, 577, 578, 578, 578, 578, 578, 578, 578, 578, 579, 579, 579, 579, 579, 579, 579, 579, 597, 597, 597, 597, 597, 597, 597, 597, 596, 596, 596, 596, 596, 596, 596, 596, 599, 599, 599, 599, 599, 599, 599, 599, 598, 598, 598, 598, 598, 598, 598, 598, 593, 593, 593, 593, 593, 593, 593, 593, 592, 592, 592, 592, 592, 592, 592, 592, 595, 595, 595, 595, 595, 595, 595, 595, 594, 594, 594, 594, 594, 594, 594, 594 +// CHECK: 614, 614, 614, 614, 614, 614, 614, 614, 615, 615, 615, 615, 615, 615, 615, 615, 612, 612, 612, 612, 612, 612, 612, 612, 613, 613, 613, 613, 613, 613, 613, 613, 610, 610, 610, 610, 610, 610, 610, 610, 611, 611, 611, 611, 611, 611, 611, 611, 608, 608, 608, 608, 608, 608, 608, 608, 609, 609, 609, 609, 609, 609, 609, 609, 631, 631, 631, 631, 631, 631, 631, 631, 630, 630, 630, 630, 630, 630, 630, 630, 629, 629, 629, 629, 629, 629, 629, 629, 628, 628, 628, 628, 628, 628, 628, 628, 627, 627, 627, 627, 627, 627, 627, 627, 626, 626, 626, 626, 626, 626, 626, 626, 625, 625, 625, 625, 625, 625, 625, 625, 624, 624, 624, 624, 624, 624, 624, 624 +// CHECK: 640, 640, 640, 640, 640, 640, 640, 640, 641, 641, 641, 641, 641, 641, 641, 641, 642, 642, 642, 642, 642, 642, 642, 642, 643, 643, 643, 643, 643, 643, 643, 643, 644, 644, 644, 644, 644, 644, 644, 644, 645, 645, 645, 645, 645, 645, 645, 645, 646, 646, 646, 646, 646, 646, 646, 646, 647, 647, 647, 647, 647, 647, 647, 647, 657, 657, 657, 657, 657, 657, 657, 657, 656, 656, 656, 656, 656, 656, 656, 656, 659, 659, 659, 659, 659, 659, 659, 659, 658, 658, 658, 658, 658, 658, 658, 658, 661, 661, 661, 661, 661, 661, 661, 661, 660, 660, 660, 660, 660, 660, 660, 660, 663, 663, 663, 663, 663, 663, 663, 663, 662, 662, 662, 662, 662, 662, 662, 662 +// CHECK: 674, 674, 674, 674, 674, 674, 674, 674, 675, 675, 675, 675, 675, 675, 675, 675, 672, 672, 672, 672, 672, 672, 672, 
672, 673, 673, 673, 673, 673, 673, 673, 673, 678, 678, 678, 678, 678, 678, 678, 678, 679, 679, 679, 679, 679, 679, 679, 679, 676, 676, 676, 676, 676, 676, 676, 676, 677, 677, 677, 677, 677, 677, 677, 677, 691, 691, 691, 691, 691, 691, 691, 691, 690, 690, 690, 690, 690, 690, 690, 690, 689, 689, 689, 689, 689, 689, 689, 689, 688, 688, 688, 688, 688, 688, 688, 688, 695, 695, 695, 695, 695, 695, 695, 695, 694, 694, 694, 694, 694, 694, 694, 694, 693, 693, 693, 693, 693, 693, 693, 693, 692, 692, 692, 692, 692, 692, 692, 692 +// CHECK: 708, 708, 708, 708, 708, 708, 708, 708, 709, 709, 709, 709, 709, 709, 709, 709, 710, 710, 710, 710, 710, 710, 710, 710, 711, 711, 711, 711, 711, 711, 711, 711, 704, 704, 704, 704, 704, 704, 704, 704, 705, 705, 705, 705, 705, 705, 705, 705, 706, 706, 706, 706, 706, 706, 706, 706, 707, 707, 707, 707, 707, 707, 707, 707, 725, 725, 725, 725, 725, 725, 725, 725, 724, 724, 724, 724, 724, 724, 724, 724, 727, 727, 727, 727, 727, 727, 727, 727, 726, 726, 726, 726, 726, 726, 726, 726, 721, 721, 721, 721, 721, 721, 721, 721, 720, 720, 720, 720, 720, 720, 720, 720, 723, 723, 723, 723, 723, 723, 723, 723, 722, 722, 722, 722, 722, 722, 722, 722 +// CHECK: 742, 742, 742, 742, 742, 742, 742, 742, 743, 743, 743, 743, 743, 743, 743, 743, 740, 740, 740, 740, 740, 740, 740, 740, 741, 741, 741, 741, 741, 741, 741, 741, 738, 738, 738, 738, 738, 738, 738, 738, 739, 739, 739, 739, 739, 739, 739, 739, 736, 736, 736, 736, 736, 736, 736, 736, 737, 737, 737, 737, 737, 737, 737, 737, 759, 759, 759, 759, 759, 759, 759, 759, 758, 758, 758, 758, 758, 758, 758, 758, 757, 757, 757, 757, 757, 757, 757, 757, 756, 756, 756, 756, 756, 756, 756, 756, 755, 755, 755, 755, 755, 755, 755, 755, 754, 754, 754, 754, 754, 754, 754, 754, 753, 753, 753, 753, 753, 753, 753, 753, 752, 752, 752, 752, 752, 752, 752, 752 +// CHECK: 768, 768, 768, 768, 768, 768, 768, 768, 769, 769, 769, 769, 769, 769, 769, 769, 770, 770, 770, 770, 770, 770, 770, 770, 771, 771, 771, 771, 771, 771, 771, 771, 772, 772, 772, 772, 772, 772, 772, 772, 773, 773, 773, 773, 773, 773, 773, 773, 774, 774, 774, 774, 774, 774, 774, 774, 775, 775, 775, 775, 775, 775, 775, 775, 785, 785, 785, 785, 785, 785, 785, 785, 784, 784, 784, 784, 784, 784, 784, 784, 787, 787, 787, 787, 787, 787, 787, 787, 786, 786, 786, 786, 786, 786, 786, 786, 789, 789, 789, 789, 789, 789, 789, 789, 788, 788, 788, 788, 788, 788, 788, 788, 791, 791, 791, 791, 791, 791, 791, 791, 790, 790, 790, 790, 790, 790, 790, 790 +// CHECK: 802, 802, 802, 802, 802, 802, 802, 802, 803, 803, 803, 803, 803, 803, 803, 803, 800, 800, 800, 800, 800, 800, 800, 800, 801, 801, 801, 801, 801, 801, 801, 801, 806, 806, 806, 806, 806, 806, 806, 806, 807, 807, 807, 807, 807, 807, 807, 807, 804, 804, 804, 804, 804, 804, 804, 804, 805, 805, 805, 805, 805, 805, 805, 805, 819, 819, 819, 819, 819, 819, 819, 819, 818, 818, 818, 818, 818, 818, 818, 818, 817, 817, 817, 817, 817, 817, 817, 817, 816, 816, 816, 816, 816, 816, 816, 816, 823, 823, 823, 823, 823, 823, 823, 823, 822, 822, 822, 822, 822, 822, 822, 822, 821, 821, 821, 821, 821, 821, 821, 821, 820, 820, 820, 820, 820, 820, 820, 820 +// CHECK: 836, 836, 836, 836, 836, 836, 836, 836, 837, 837, 837, 837, 837, 837, 837, 837, 838, 838, 838, 838, 838, 838, 838, 838, 839, 839, 839, 839, 839, 839, 839, 839, 832, 832, 832, 832, 832, 832, 832, 832, 833, 833, 833, 833, 833, 833, 833, 833, 834, 834, 834, 834, 834, 834, 834, 834, 835, 835, 835, 835, 835, 835, 835, 835, 853, 853, 853, 853, 853, 853, 853, 853, 852, 852, 852, 852, 852, 852, 852, 852, 855, 855, 855, 855, 
855, 855, 855, 855, 854, 854, 854, 854, 854, 854, 854, 854, 849, 849, 849, 849, 849, 849, 849, 849, 848, 848, 848, 848, 848, 848, 848, 848, 851, 851, 851, 851, 851, 851, 851, 851, 850, 850, 850, 850, 850, 850, 850, 850 +// CHECK: 870, 870, 870, 870, 870, 870, 870, 870, 871, 871, 871, 871, 871, 871, 871, 871, 868, 868, 868, 868, 868, 868, 868, 868, 869, 869, 869, 869, 869, 869, 869, 869, 866, 866, 866, 866, 866, 866, 866, 866, 867, 867, 867, 867, 867, 867, 867, 867, 864, 864, 864, 864, 864, 864, 864, 864, 865, 865, 865, 865, 865, 865, 865, 865, 887, 887, 887, 887, 887, 887, 887, 887, 886, 886, 886, 886, 886, 886, 886, 886, 885, 885, 885, 885, 885, 885, 885, 885, 884, 884, 884, 884, 884, 884, 884, 884, 883, 883, 883, 883, 883, 883, 883, 883, 882, 882, 882, 882, 882, 882, 882, 882, 881, 881, 881, 881, 881, 881, 881, 881, 880, 880, 880, 880, 880, 880, 880, 880 +// CHECK: 896, 896, 896, 896, 896, 896, 896, 896, 897, 897, 897, 897, 897, 897, 897, 897, 898, 898, 898, 898, 898, 898, 898, 898, 899, 899, 899, 899, 899, 899, 899, 899, 900, 900, 900, 900, 900, 900, 900, 900, 901, 901, 901, 901, 901, 901, 901, 901, 902, 902, 902, 902, 902, 902, 902, 902, 903, 903, 903, 903, 903, 903, 903, 903, 913, 913, 913, 913, 913, 913, 913, 913, 912, 912, 912, 912, 912, 912, 912, 912, 915, 915, 915, 915, 915, 915, 915, 915, 914, 914, 914, 914, 914, 914, 914, 914, 917, 917, 917, 917, 917, 917, 917, 917, 916, 916, 916, 916, 916, 916, 916, 916, 919, 919, 919, 919, 919, 919, 919, 919, 918, 918, 918, 918, 918, 918, 918, 918 +// CHECK: 930, 930, 930, 930, 930, 930, 930, 930, 931, 931, 931, 931, 931, 931, 931, 931, 928, 928, 928, 928, 928, 928, 928, 928, 929, 929, 929, 929, 929, 929, 929, 929, 934, 934, 934, 934, 934, 934, 934, 934, 935, 935, 935, 935, 935, 935, 935, 935, 932, 932, 932, 932, 932, 932, 932, 932, 933, 933, 933, 933, 933, 933, 933, 933, 947, 947, 947, 947, 947, 947, 947, 947, 946, 946, 946, 946, 946, 946, 946, 946, 945, 945, 945, 945, 945, 945, 945, 945, 944, 944, 944, 944, 944, 944, 944, 944, 951, 951, 951, 951, 951, 951, 951, 951, 950, 950, 950, 950, 950, 950, 950, 950, 949, 949, 949, 949, 949, 949, 949, 949, 948, 948, 948, 948, 948, 948, 948, 948 +// CHECK: 964, 964, 964, 964, 964, 964, 964, 964, 965, 965, 965, 965, 965, 965, 965, 965, 966, 966, 966, 966, 966, 966, 966, 966, 967, 967, 967, 967, 967, 967, 967, 967, 960, 960, 960, 960, 960, 960, 960, 960, 961, 961, 961, 961, 961, 961, 961, 961, 962, 962, 962, 962, 962, 962, 962, 962, 963, 963, 963, 963, 963, 963, 963, 963, 981, 981, 981, 981, 981, 981, 981, 981, 980, 980, 980, 980, 980, 980, 980, 980, 983, 983, 983, 983, 983, 983, 983, 983, 982, 982, 982, 982, 982, 982, 982, 982, 977, 977, 977, 977, 977, 977, 977, 977, 976, 976, 976, 976, 976, 976, 976, 976, 979, 979, 979, 979, 979, 979, 979, 979, 978, 978, 978, 978, 978, 978, 978, 978 +// CHECK: 998, 998, 998, 998, 998, 998, 998, 998, 999, 999, 999, 999, 999, 999, 999, 999, 996, 996, 996, 996, 996, 996, 996, 996, 997, 997, 997, 997, 997, 997, 997, 997, 994, 994, 994, 994, 994, 994, 994, 994, 995, 995, 995, 995, 995, 995, 995, 995, 992, 992, 992, 992, 992, 992, 992, 992, 993, 993, 993, 993, 993, 993, 993, 993, 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1015, 1014, 1014, 1014, 1014, 1014, 1014, 1014, 1014, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1013, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1011, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1009, 1008, 1008, 1008, 1008, 1008, 1008, 1008, 1008 +// CHECK: 8, 8, 8, 
8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 24, 24, 24, 24, 24, 24, 24, 24, 27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 29, 29, 29, 29, 29, 29, 29, 29, 28, 28, 28, 28, 28, 28, 28, 28, 31, 31, 31, 31, 31, 31, 31, 31, 30, 30, 30, 30, 30, 30, 30, 30 +// CHECK: 42, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 43, 40, 40, 40, 40, 40, 40, 40, 40, 41, 41, 41, 41, 41, 41, 41, 41, 46, 46, 46, 46, 46, 46, 46, 46, 47, 47, 47, 47, 47, 47, 47, 47, 44, 44, 44, 44, 44, 44, 44, 44, 45, 45, 45, 45, 45, 45, 45, 45, 59, 59, 59, 59, 59, 59, 59, 59, 58, 58, 58, 58, 58, 58, 58, 58, 57, 57, 57, 57, 57, 57, 57, 57, 56, 56, 56, 56, 56, 56, 56, 56, 63, 63, 63, 63, 63, 63, 63, 63, 62, 62, 62, 62, 62, 62, 62, 62, 61, 61, 61, 61, 61, 61, 61, 61, 60, 60, 60, 60, 60, 60, 60, 60 +// CHECK: 76, 76, 76, 76, 76, 76, 76, 76, 77, 77, 77, 77, 77, 77, 77, 77, 78, 78, 78, 78, 78, 78, 78, 78, 79, 79, 79, 79, 79, 79, 79, 79, 72, 72, 72, 72, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 74, 74, 74, 74, 75, 75, 75, 75, 75, 75, 75, 75, 93, 93, 93, 93, 93, 93, 93, 93, 92, 92, 92, 92, 92, 92, 92, 92, 95, 95, 95, 95, 95, 95, 95, 95, 94, 94, 94, 94, 94, 94, 94, 94, 89, 89, 89, 89, 89, 89, 89, 89, 88, 88, 88, 88, 88, 88, 88, 88, 91, 91, 91, 91, 91, 91, 91, 91, 90, 90, 90, 90, 90, 90, 90, 90 +// CHECK: 110, 110, 110, 110, 110, 110, 110, 110, 111, 111, 111, 111, 111, 111, 111, 111, 108, 108, 108, 108, 108, 108, 108, 108, 109, 109, 109, 109, 109, 109, 109, 109, 106, 106, 106, 106, 106, 106, 106, 106, 107, 107, 107, 107, 107, 107, 107, 107, 104, 104, 104, 104, 104, 104, 104, 104, 105, 105, 105, 105, 105, 105, 105, 105, 127, 127, 127, 127, 127, 127, 127, 127, 126, 126, 126, 126, 126, 126, 126, 126, 125, 125, 125, 125, 125, 125, 125, 125, 124, 124, 124, 124, 124, 124, 124, 124, 123, 123, 123, 123, 123, 123, 123, 123, 122, 122, 122, 122, 122, 122, 122, 122, 121, 121, 121, 121, 121, 121, 121, 121, 120, 120, 120, 120, 120, 120, 120, 120 +// CHECK: 136, 136, 136, 136, 136, 136, 136, 136, 137, 137, 137, 137, 137, 137, 137, 137, 138, 138, 138, 138, 138, 138, 138, 138, 139, 139, 139, 139, 139, 139, 139, 139, 140, 140, 140, 140, 140, 140, 140, 140, 141, 141, 141, 141, 141, 141, 141, 141, 142, 142, 142, 142, 142, 142, 142, 142, 143, 143, 143, 143, 143, 143, 143, 143, 153, 153, 153, 153, 153, 153, 153, 153, 152, 152, 152, 152, 152, 152, 152, 152, 155, 155, 155, 155, 155, 155, 155, 155, 154, 154, 154, 154, 154, 154, 154, 154, 157, 157, 157, 157, 157, 157, 157, 157, 156, 156, 156, 156, 156, 156, 156, 156, 159, 159, 159, 159, 159, 159, 159, 159, 158, 158, 158, 158, 158, 158, 158, 158 +// CHECK: 170, 170, 170, 170, 170, 170, 170, 170, 171, 171, 171, 171, 171, 171, 171, 171, 168, 168, 168, 168, 168, 168, 168, 168, 169, 169, 169, 169, 169, 169, 169, 169, 174, 174, 174, 174, 174, 174, 174, 174, 175, 175, 175, 175, 175, 175, 175, 175, 172, 172, 172, 172, 172, 172, 172, 172, 173, 173, 173, 173, 173, 173, 173, 173, 187, 187, 187, 187, 187, 187, 187, 187, 186, 186, 186, 186, 186, 186, 186, 186, 185, 185, 185, 185, 185, 185, 185, 185, 184, 184, 184, 184, 184, 184, 184, 184, 191, 191, 191, 191, 191, 191, 191, 191, 190, 190, 190, 190, 190, 190, 190, 190, 189, 189, 189, 189, 189, 189, 189, 189, 188, 188, 188, 188, 188, 188, 188, 188 +// CHECK: 204, 204, 204, 204, 204, 204, 204, 204, 205, 205, 205, 205, 
205, 205, 205, 205, 206, 206, 206, 206, 206, 206, 206, 206, 207, 207, 207, 207, 207, 207, 207, 207, 200, 200, 200, 200, 200, 200, 200, 200, 201, 201, 201, 201, 201, 201, 201, 201, 202, 202, 202, 202, 202, 202, 202, 202, 203, 203, 203, 203, 203, 203, 203, 203, 221, 221, 221, 221, 221, 221, 221, 221, 220, 220, 220, 220, 220, 220, 220, 220, 223, 223, 223, 223, 223, 223, 223, 223, 222, 222, 222, 222, 222, 222, 222, 222, 217, 217, 217, 217, 217, 217, 217, 217, 216, 216, 216, 216, 216, 216, 216, 216, 219, 219, 219, 219, 219, 219, 219, 219, 218, 218, 218, 218, 218, 218, 218, 218 +// CHECK: 238, 238, 238, 238, 238, 238, 238, 238, 239, 239, 239, 239, 239, 239, 239, 239, 236, 236, 236, 236, 236, 236, 236, 236, 237, 237, 237, 237, 237, 237, 237, 237, 234, 234, 234, 234, 234, 234, 234, 234, 235, 235, 235, 235, 235, 235, 235, 235, 232, 232, 232, 232, 232, 232, 232, 232, 233, 233, 233, 233, 233, 233, 233, 233, 255, 255, 255, 255, 255, 255, 255, 255, 254, 254, 254, 254, 254, 254, 254, 254, 253, 253, 253, 253, 253, 253, 253, 253, 252, 252, 252, 252, 252, 252, 252, 252, 251, 251, 251, 251, 251, 251, 251, 251, 250, 250, 250, 250, 250, 250, 250, 250, 249, 249, 249, 249, 249, 249, 249, 249, 248, 248, 248, 248, 248, 248, 248, 248 +// CHECK: 264, 264, 264, 264, 264, 264, 264, 264, 265, 265, 265, 265, 265, 265, 265, 265, 266, 266, 266, 266, 266, 266, 266, 266, 267, 267, 267, 267, 267, 267, 267, 267, 268, 268, 268, 268, 268, 268, 268, 268, 269, 269, 269, 269, 269, 269, 269, 269, 270, 270, 270, 270, 270, 270, 270, 270, 271, 271, 271, 271, 271, 271, 271, 271, 281, 281, 281, 281, 281, 281, 281, 281, 280, 280, 280, 280, 280, 280, 280, 280, 283, 283, 283, 283, 283, 283, 283, 283, 282, 282, 282, 282, 282, 282, 282, 282, 285, 285, 285, 285, 285, 285, 285, 285, 284, 284, 284, 284, 284, 284, 284, 284, 287, 287, 287, 287, 287, 287, 287, 287, 286, 286, 286, 286, 286, 286, 286, 286 +// CHECK: 298, 298, 298, 298, 298, 298, 298, 298, 299, 299, 299, 299, 299, 299, 299, 299, 296, 296, 296, 296, 296, 296, 296, 296, 297, 297, 297, 297, 297, 297, 297, 297, 302, 302, 302, 302, 302, 302, 302, 302, 303, 303, 303, 303, 303, 303, 303, 303, 300, 300, 300, 300, 300, 300, 300, 300, 301, 301, 301, 301, 301, 301, 301, 301, 315, 315, 315, 315, 315, 315, 315, 315, 314, 314, 314, 314, 314, 314, 314, 314, 313, 313, 313, 313, 313, 313, 313, 313, 312, 312, 312, 312, 312, 312, 312, 312, 319, 319, 319, 319, 319, 319, 319, 319, 318, 318, 318, 318, 318, 318, 318, 318, 317, 317, 317, 317, 317, 317, 317, 317, 316, 316, 316, 316, 316, 316, 316, 316 +// CHECK: 332, 332, 332, 332, 332, 332, 332, 332, 333, 333, 333, 333, 333, 333, 333, 333, 334, 334, 334, 334, 334, 334, 334, 334, 335, 335, 335, 335, 335, 335, 335, 335, 328, 328, 328, 328, 328, 328, 328, 328, 329, 329, 329, 329, 329, 329, 329, 329, 330, 330, 330, 330, 330, 330, 330, 330, 331, 331, 331, 331, 331, 331, 331, 331, 349, 349, 349, 349, 349, 349, 349, 349, 348, 348, 348, 348, 348, 348, 348, 348, 351, 351, 351, 351, 351, 351, 351, 351, 350, 350, 350, 350, 350, 350, 350, 350, 345, 345, 345, 345, 345, 345, 345, 345, 344, 344, 344, 344, 344, 344, 344, 344, 347, 347, 347, 347, 347, 347, 347, 347, 346, 346, 346, 346, 346, 346, 346, 346 +// CHECK: 366, 366, 366, 366, 366, 366, 366, 366, 367, 367, 367, 367, 367, 367, 367, 367, 364, 364, 364, 364, 364, 364, 364, 364, 365, 365, 365, 365, 365, 365, 365, 365, 362, 362, 362, 362, 362, 362, 362, 362, 363, 363, 363, 363, 363, 363, 363, 363, 360, 360, 360, 360, 360, 360, 360, 360, 361, 361, 361, 361, 361, 361, 361, 361, 383, 383, 383, 383, 383, 383, 383, 383, 382, 
382, 382, 382, 382, 382, 382, 382, 381, 381, 381, 381, 381, 381, 381, 381, 380, 380, 380, 380, 380, 380, 380, 380, 379, 379, 379, 379, 379, 379, 379, 379, 378, 378, 378, 378, 378, 378, 378, 378, 377, 377, 377, 377, 377, 377, 377, 377, 376, 376, 376, 376, 376, 376, 376, 376 +// CHECK: 392, 392, 392, 392, 392, 392, 392, 392, 393, 393, 393, 393, 393, 393, 393, 393, 394, 394, 394, 394, 394, 394, 394, 394, 395, 395, 395, 395, 395, 395, 395, 395, 396, 396, 396, 396, 396, 396, 396, 396, 397, 397, 397, 397, 397, 397, 397, 397, 398, 398, 398, 398, 398, 398, 398, 398, 399, 399, 399, 399, 399, 399, 399, 399, 409, 409, 409, 409, 409, 409, 409, 409, 408, 408, 408, 408, 408, 408, 408, 408, 411, 411, 411, 411, 411, 411, 411, 411, 410, 410, 410, 410, 410, 410, 410, 410, 413, 413, 413, 413, 413, 413, 413, 413, 412, 412, 412, 412, 412, 412, 412, 412, 415, 415, 415, 415, 415, 415, 415, 415, 414, 414, 414, 414, 414, 414, 414, 414 +// CHECK: 426, 426, 426, 426, 426, 426, 426, 426, 427, 427, 427, 427, 427, 427, 427, 427, 424, 424, 424, 424, 424, 424, 424, 424, 425, 425, 425, 425, 425, 425, 425, 425, 430, 430, 430, 430, 430, 430, 430, 430, 431, 431, 431, 431, 431, 431, 431, 431, 428, 428, 428, 428, 428, 428, 428, 428, 429, 429, 429, 429, 429, 429, 429, 429, 443, 443, 443, 443, 443, 443, 443, 443, 442, 442, 442, 442, 442, 442, 442, 442, 441, 441, 441, 441, 441, 441, 441, 441, 440, 440, 440, 440, 440, 440, 440, 440, 447, 447, 447, 447, 447, 447, 447, 447, 446, 446, 446, 446, 446, 446, 446, 446, 445, 445, 445, 445, 445, 445, 445, 445, 444, 444, 444, 444, 444, 444, 444, 444 +// CHECK: 460, 460, 460, 460, 460, 460, 460, 460, 461, 461, 461, 461, 461, 461, 461, 461, 462, 462, 462, 462, 462, 462, 462, 462, 463, 463, 463, 463, 463, 463, 463, 463, 456, 456, 456, 456, 456, 456, 456, 456, 457, 457, 457, 457, 457, 457, 457, 457, 458, 458, 458, 458, 458, 458, 458, 458, 459, 459, 459, 459, 459, 459, 459, 459, 477, 477, 477, 477, 477, 477, 477, 477, 476, 476, 476, 476, 476, 476, 476, 476, 479, 479, 479, 479, 479, 479, 479, 479, 478, 478, 478, 478, 478, 478, 478, 478, 473, 473, 473, 473, 473, 473, 473, 473, 472, 472, 472, 472, 472, 472, 472, 472, 475, 475, 475, 475, 475, 475, 475, 475, 474, 474, 474, 474, 474, 474, 474, 474 +// CHECK: 494, 494, 494, 494, 494, 494, 494, 494, 495, 495, 495, 495, 495, 495, 495, 495, 492, 492, 492, 492, 492, 492, 492, 492, 493, 493, 493, 493, 493, 493, 493, 493, 490, 490, 490, 490, 490, 490, 490, 490, 491, 491, 491, 491, 491, 491, 491, 491, 488, 488, 488, 488, 488, 488, 488, 488, 489, 489, 489, 489, 489, 489, 489, 489, 511, 511, 511, 511, 511, 511, 511, 511, 510, 510, 510, 510, 510, 510, 510, 510, 509, 509, 509, 509, 509, 509, 509, 509, 508, 508, 508, 508, 508, 508, 508, 508, 507, 507, 507, 507, 507, 507, 507, 507, 506, 506, 506, 506, 506, 506, 506, 506, 505, 505, 505, 505, 505, 505, 505, 505, 504, 504, 504, 504, 504, 504, 504, 504 +// CHECK: 520, 520, 520, 520, 520, 520, 520, 520, 521, 521, 521, 521, 521, 521, 521, 521, 522, 522, 522, 522, 522, 522, 522, 522, 523, 523, 523, 523, 523, 523, 523, 523, 524, 524, 524, 524, 524, 524, 524, 524, 525, 525, 525, 525, 525, 525, 525, 525, 526, 526, 526, 526, 526, 526, 526, 526, 527, 527, 527, 527, 527, 527, 527, 527, 537, 537, 537, 537, 537, 537, 537, 537, 536, 536, 536, 536, 536, 536, 536, 536, 539, 539, 539, 539, 539, 539, 539, 539, 538, 538, 538, 538, 538, 538, 538, 538, 541, 541, 541, 541, 541, 541, 541, 541, 540, 540, 540, 540, 540, 540, 540, 540, 543, 543, 543, 543, 543, 543, 543, 543, 542, 542, 542, 542, 542, 542, 542, 542 +// CHECK: 554, 554, 554, 554, 
554, 554, 554, 554, 555, 555, 555, 555, 555, 555, 555, 555, 552, 552, 552, 552, 552, 552, 552, 552, 553, 553, 553, 553, 553, 553, 553, 553, 558, 558, 558, 558, 558, 558, 558, 558, 559, 559, 559, 559, 559, 559, 559, 559, 556, 556, 556, 556, 556, 556, 556, 556, 557, 557, 557, 557, 557, 557, 557, 557, 571, 571, 571, 571, 571, 571, 571, 571, 570, 570, 570, 570, 570, 570, 570, 570, 569, 569, 569, 569, 569, 569, 569, 569, 568, 568, 568, 568, 568, 568, 568, 568, 575, 575, 575, 575, 575, 575, 575, 575, 574, 574, 574, 574, 574, 574, 574, 574, 573, 573, 573, 573, 573, 573, 573, 573, 572, 572, 572, 572, 572, 572, 572, 572 +// CHECK: 588, 588, 588, 588, 588, 588, 588, 588, 589, 589, 589, 589, 589, 589, 589, 589, 590, 590, 590, 590, 590, 590, 590, 590, 591, 591, 591, 591, 591, 591, 591, 591, 584, 584, 584, 584, 584, 584, 584, 584, 585, 585, 585, 585, 585, 585, 585, 585, 586, 586, 586, 586, 586, 586, 586, 586, 587, 587, 587, 587, 587, 587, 587, 587, 605, 605, 605, 605, 605, 605, 605, 605, 604, 604, 604, 604, 604, 604, 604, 604, 607, 607, 607, 607, 607, 607, 607, 607, 606, 606, 606, 606, 606, 606, 606, 606, 601, 601, 601, 601, 601, 601, 601, 601, 600, 600, 600, 600, 600, 600, 600, 600, 603, 603, 603, 603, 603, 603, 603, 603, 602, 602, 602, 602, 602, 602, 602, 602 +// CHECK: 622, 622, 622, 622, 622, 622, 622, 622, 623, 623, 623, 623, 623, 623, 623, 623, 620, 620, 620, 620, 620, 620, 620, 620, 621, 621, 621, 621, 621, 621, 621, 621, 618, 618, 618, 618, 618, 618, 618, 618, 619, 619, 619, 619, 619, 619, 619, 619, 616, 616, 616, 616, 616, 616, 616, 616, 617, 617, 617, 617, 617, 617, 617, 617, 639, 639, 639, 639, 639, 639, 639, 639, 638, 638, 638, 638, 638, 638, 638, 638, 637, 637, 637, 637, 637, 637, 637, 637, 636, 636, 636, 636, 636, 636, 636, 636, 635, 635, 635, 635, 635, 635, 635, 635, 634, 634, 634, 634, 634, 634, 634, 634, 633, 633, 633, 633, 633, 633, 633, 633, 632, 632, 632, 632, 632, 632, 632, 632 +// CHECK: 648, 648, 648, 648, 648, 648, 648, 648, 649, 649, 649, 649, 649, 649, 649, 649, 650, 650, 650, 650, 650, 650, 650, 650, 651, 651, 651, 651, 651, 651, 651, 651, 652, 652, 652, 652, 652, 652, 652, 652, 653, 653, 653, 653, 653, 653, 653, 653, 654, 654, 654, 654, 654, 654, 654, 654, 655, 655, 655, 655, 655, 655, 655, 655, 665, 665, 665, 665, 665, 665, 665, 665, 664, 664, 664, 664, 664, 664, 664, 664, 667, 667, 667, 667, 667, 667, 667, 667, 666, 666, 666, 666, 666, 666, 666, 666, 669, 669, 669, 669, 669, 669, 669, 669, 668, 668, 668, 668, 668, 668, 668, 668, 671, 671, 671, 671, 671, 671, 671, 671, 670, 670, 670, 670, 670, 670, 670, 670 +// CHECK: 682, 682, 682, 682, 682, 682, 682, 682, 683, 683, 683, 683, 683, 683, 683, 683, 680, 680, 680, 680, 680, 680, 680, 680, 681, 681, 681, 681, 681, 681, 681, 681, 686, 686, 686, 686, 686, 686, 686, 686, 687, 687, 687, 687, 687, 687, 687, 687, 684, 684, 684, 684, 684, 684, 684, 684, 685, 685, 685, 685, 685, 685, 685, 685, 699, 699, 699, 699, 699, 699, 699, 699, 698, 698, 698, 698, 698, 698, 698, 698, 697, 697, 697, 697, 697, 697, 697, 697, 696, 696, 696, 696, 696, 696, 696, 696, 703, 703, 703, 703, 703, 703, 703, 703, 702, 702, 702, 702, 702, 702, 702, 702, 701, 701, 701, 701, 701, 701, 701, 701, 700, 700, 700, 700, 700, 700, 700, 700 +// CHECK: 716, 716, 716, 716, 716, 716, 716, 716, 717, 717, 717, 717, 717, 717, 717, 717, 718, 718, 718, 718, 718, 718, 718, 718, 719, 719, 719, 719, 719, 719, 719, 719, 712, 712, 712, 712, 712, 712, 712, 712, 713, 713, 713, 713, 713, 713, 713, 713, 714, 714, 714, 714, 714, 714, 714, 714, 715, 715, 715, 715, 715, 715, 715, 715, 733, 
733, 733, 733, 733, 733, 733, 733, 732, 732, 732, 732, 732, 732, 732, 732, 735, 735, 735, 735, 735, 735, 735, 735, 734, 734, 734, 734, 734, 734, 734, 734, 729, 729, 729, 729, 729, 729, 729, 729, 728, 728, 728, 728, 728, 728, 728, 728, 731, 731, 731, 731, 731, 731, 731, 731, 730, 730, 730, 730, 730, 730, 730, 730 +// CHECK: 750, 750, 750, 750, 750, 750, 750, 750, 751, 751, 751, 751, 751, 751, 751, 751, 748, 748, 748, 748, 748, 748, 748, 748, 749, 749, 749, 749, 749, 749, 749, 749, 746, 746, 746, 746, 746, 746, 746, 746, 747, 747, 747, 747, 747, 747, 747, 747, 744, 744, 744, 744, 744, 744, 744, 744, 745, 745, 745, 745, 745, 745, 745, 745, 767, 767, 767, 767, 767, 767, 767, 767, 766, 766, 766, 766, 766, 766, 766, 766, 765, 765, 765, 765, 765, 765, 765, 765, 764, 764, 764, 764, 764, 764, 764, 764, 763, 763, 763, 763, 763, 763, 763, 763, 762, 762, 762, 762, 762, 762, 762, 762, 761, 761, 761, 761, 761, 761, 761, 761, 760, 760, 760, 760, 760, 760, 760, 760 +// CHECK: 776, 776, 776, 776, 776, 776, 776, 776, 777, 777, 777, 777, 777, 777, 777, 777, 778, 778, 778, 778, 778, 778, 778, 778, 779, 779, 779, 779, 779, 779, 779, 779, 780, 780, 780, 780, 780, 780, 780, 780, 781, 781, 781, 781, 781, 781, 781, 781, 782, 782, 782, 782, 782, 782, 782, 782, 783, 783, 783, 783, 783, 783, 783, 783, 793, 793, 793, 793, 793, 793, 793, 793, 792, 792, 792, 792, 792, 792, 792, 792, 795, 795, 795, 795, 795, 795, 795, 795, 794, 794, 794, 794, 794, 794, 794, 794, 797, 797, 797, 797, 797, 797, 797, 797, 796, 796, 796, 796, 796, 796, 796, 796, 799, 799, 799, 799, 799, 799, 799, 799, 798, 798, 798, 798, 798, 798, 798, 798 +// CHECK: 810, 810, 810, 810, 810, 810, 810, 810, 811, 811, 811, 811, 811, 811, 811, 811, 808, 808, 808, 808, 808, 808, 808, 808, 809, 809, 809, 809, 809, 809, 809, 809, 814, 814, 814, 814, 814, 814, 814, 814, 815, 815, 815, 815, 815, 815, 815, 815, 812, 812, 812, 812, 812, 812, 812, 812, 813, 813, 813, 813, 813, 813, 813, 813, 827, 827, 827, 827, 827, 827, 827, 827, 826, 826, 826, 826, 826, 826, 826, 826, 825, 825, 825, 825, 825, 825, 825, 825, 824, 824, 824, 824, 824, 824, 824, 824, 831, 831, 831, 831, 831, 831, 831, 831, 830, 830, 830, 830, 830, 830, 830, 830, 829, 829, 829, 829, 829, 829, 829, 829, 828, 828, 828, 828, 828, 828, 828, 828 +// CHECK: 844, 844, 844, 844, 844, 844, 844, 844, 845, 845, 845, 845, 845, 845, 845, 845, 846, 846, 846, 846, 846, 846, 846, 846, 847, 847, 847, 847, 847, 847, 847, 847, 840, 840, 840, 840, 840, 840, 840, 840, 841, 841, 841, 841, 841, 841, 841, 841, 842, 842, 842, 842, 842, 842, 842, 842, 843, 843, 843, 843, 843, 843, 843, 843, 861, 861, 861, 861, 861, 861, 861, 861, 860, 860, 860, 860, 860, 860, 860, 860, 863, 863, 863, 863, 863, 863, 863, 863, 862, 862, 862, 862, 862, 862, 862, 862, 857, 857, 857, 857, 857, 857, 857, 857, 856, 856, 856, 856, 856, 856, 856, 856, 859, 859, 859, 859, 859, 859, 859, 859, 858, 858, 858, 858, 858, 858, 858, 858 +// CHECK: 878, 878, 878, 878, 878, 878, 878, 878, 879, 879, 879, 879, 879, 879, 879, 879, 876, 876, 876, 876, 876, 876, 876, 876, 877, 877, 877, 877, 877, 877, 877, 877, 874, 874, 874, 874, 874, 874, 874, 874, 875, 875, 875, 875, 875, 875, 875, 875, 872, 872, 872, 872, 872, 872, 872, 872, 873, 873, 873, 873, 873, 873, 873, 873, 895, 895, 895, 895, 895, 895, 895, 895, 894, 894, 894, 894, 894, 894, 894, 894, 893, 893, 893, 893, 893, 893, 893, 893, 892, 892, 892, 892, 892, 892, 892, 892, 891, 891, 891, 891, 891, 891, 891, 891, 890, 890, 890, 890, 890, 890, 890, 890, 889, 889, 889, 889, 889, 889, 889, 889, 888, 888, 888, 888, 888, 888, 
888, 888 +// CHECK: 904, 904, 904, 904, 904, 904, 904, 904, 905, 905, 905, 905, 905, 905, 905, 905, 906, 906, 906, 906, 906, 906, 906, 906, 907, 907, 907, 907, 907, 907, 907, 907, 908, 908, 908, 908, 908, 908, 908, 908, 909, 909, 909, 909, 909, 909, 909, 909, 910, 910, 910, 910, 910, 910, 910, 910, 911, 911, 911, 911, 911, 911, 911, 911, 921, 921, 921, 921, 921, 921, 921, 921, 920, 920, 920, 920, 920, 920, 920, 920, 923, 923, 923, 923, 923, 923, 923, 923, 922, 922, 922, 922, 922, 922, 922, 922, 925, 925, 925, 925, 925, 925, 925, 925, 924, 924, 924, 924, 924, 924, 924, 924, 927, 927, 927, 927, 927, 927, 927, 927, 926, 926, 926, 926, 926, 926, 926, 926 +// CHECK: 938, 938, 938, 938, 938, 938, 938, 938, 939, 939, 939, 939, 939, 939, 939, 939, 936, 936, 936, 936, 936, 936, 936, 936, 937, 937, 937, 937, 937, 937, 937, 937, 942, 942, 942, 942, 942, 942, 942, 942, 943, 943, 943, 943, 943, 943, 943, 943, 940, 940, 940, 940, 940, 940, 940, 940, 941, 941, 941, 941, 941, 941, 941, 941, 955, 955, 955, 955, 955, 955, 955, 955, 954, 954, 954, 954, 954, 954, 954, 954, 953, 953, 953, 953, 953, 953, 953, 953, 952, 952, 952, 952, 952, 952, 952, 952, 959, 959, 959, 959, 959, 959, 959, 959, 958, 958, 958, 958, 958, 958, 958, 958, 957, 957, 957, 957, 957, 957, 957, 957, 956, 956, 956, 956, 956, 956, 956, 956 +// CHECK: 972, 972, 972, 972, 972, 972, 972, 972, 973, 973, 973, 973, 973, 973, 973, 973, 974, 974, 974, 974, 974, 974, 974, 974, 975, 975, 975, 975, 975, 975, 975, 975, 968, 968, 968, 968, 968, 968, 968, 968, 969, 969, 969, 969, 969, 969, 969, 969, 970, 970, 970, 970, 970, 970, 970, 970, 971, 971, 971, 971, 971, 971, 971, 971, 989, 989, 989, 989, 989, 989, 989, 989, 988, 988, 988, 988, 988, 988, 988, 988, 991, 991, 991, 991, 991, 991, 991, 991, 990, 990, 990, 990, 990, 990, 990, 990, 985, 985, 985, 985, 985, 985, 985, 985, 984, 984, 984, 984, 984, 984, 984, 984, 987, 987, 987, 987, 987, 987, 987, 987, 986, 986, 986, 986, 986, 986, 986, 986 +// CHECK: 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1006, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1007, 1004, 1004, 1004, 1004, 1004, 1004, 1004, 1004, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1005, 1002, 1002, 1002, 1002, 1002, 1002, 1002, 1002, 1003, 1003, 1003, 1003, 1003, 1003, 1003, 1003, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1001, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1023, 1022, 1022, 1022, 1022, 1022, 1022, 1022, 1022, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1021, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1019, 1019, 1019, 1019, 1019, 1019, 1019, 1019, 1018, 1018, 1018, 1018, 1018, 1018, 1018, 1018, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1017, 1016, 1016, 1016, 1016, 1016, 1016, 1016, 1016

From 4506de1e2bfe9352331d75fe6d3c89f3d2f5287c Mon Sep 17 00:00:00 2001
From: Uday Bondhugula
Date: Thu, 26 Oct 2023 13:34:54 +0530
Subject: [PATCH 026/877] NFC. Move out and expose affine expression
 simplification utility out of AffineOps lib (#69813)

Move trivial affine expression simplification out of the AffineOps library
and expose it from libIR. Users of such methods shouldn't have to rely on
the AffineOps dialect. For example, with this change, the method can now be
used from lib/Analysis/ (FlatLinearConstraints) as well as from AffineOps
dialect canonicalization, and those users no longer need to depend on
AffineOps for some simplification of affine expressions.
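As a quick illustration of the now-public entry point, a minimal caller could
look like the sketch below. This example is not part of the patch; the driver
function and the concrete bounds are invented for illustration. It bounds
expr = d0 + s0 * 2 given 0 <= d0 <= 7 and 1 <= s0 <= 4:

    #include "mlir/IR/AffineExpr.h"
    #include "mlir/IR/MLIRContext.h"
    #include "llvm/ADT/SmallVector.h"
    #include <cstdint>
    #include <optional>

    int main() {
      mlir::MLIRContext ctx;
      mlir::AffineExpr d0 = mlir::getAffineDimExpr(0, &ctx);
      mlir::AffineExpr s0 = mlir::getAffineSymbolExpr(0, &ctx);
      mlir::AffineExpr expr = d0 + s0 * 2;

      // One entry per dimension, then per symbol; std::nullopt would mark an
      // input with no known constant bound.
      llvm::SmallVector<std::optional<int64_t>> lbs = {0, 1};
      llvm::SmallVector<std::optional<int64_t>> ubs = {7, 4};

      // Inclusive bounds: upper = 7 + 2 * 4 = 15, lower = 0 + 2 * 1 = 2.
      std::optional<int64_t> ub = mlir::getBoundForAffineExpr(
          expr, /*numDims=*/1, /*numSymbols=*/1, lbs, ubs, /*isUpper=*/true);
      std::optional<int64_t> lb = mlir::getBoundForAffineExpr(
          expr, /*numDims=*/1, /*numSymbols=*/1, lbs, ubs, /*isUpper=*/false);
      return (ub == 15 && lb == 2) ? 0 : 1;
    }

Because the declaration now lives in mlir/include/mlir/IR/AffineExpr.h, such a
caller needs only libIR, without a dependency on the AffineOps dialect library,
which is the point of the move.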
--- mlir/include/mlir/IR/AffineExpr.h | 14 +++ mlir/lib/Dialect/Affine/IR/AffineOps.cpp | 115 +++-------------------- mlir/lib/IR/AffineExpr.cpp | 80 ++++++++++++++++ 3 files changed, 108 insertions(+), 101 deletions(-) diff --git a/mlir/include/mlir/IR/AffineExpr.h b/mlir/include/mlir/IR/AffineExpr.h index 8ced8770591ee..69e02c94ef270 100644 --- a/mlir/include/mlir/IR/AffineExpr.h +++ b/mlir/include/mlir/IR/AffineExpr.h @@ -353,6 +353,20 @@ void bindSymbolsList(MLIRContext *ctx, MutableArrayRef exprs) { e = getAffineSymbolExpr(idx++, ctx); } +/// Get a lower or upper (depending on `isUpper`) bound for `expr` while using +/// the constant lower and upper bounds for its inputs provided in +/// `constLowerBounds` and `constUpperBounds`. Return std::nullopt if such a +/// bound can't be computed. This method only handles simple sum of product +/// expressions (w.r.t constant coefficients) so as to not depend on anything +/// heavyweight in `Analysis`. Expressions of the form: c0*d0 + c1*d1 + c2*s0 + +/// ... + c_n are handled. Expressions involving floordiv, ceildiv, mod or +/// semi-affine ones will lead a none being returned. +std::optional +getBoundForAffineExpr(AffineExpr expr, unsigned numDims, unsigned numSymbols, + ArrayRef> constLowerBounds, + ArrayRef> constUpperBounds, + bool isUpper); + } // namespace mlir namespace llvm { diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 4d79c458889d2..ba4285bd52394 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -700,93 +700,6 @@ static std::optional getUpperBound(Value iv) { return forOp.getConstantUpperBound() - 1; } -/// Get a lower or upper (depending on `isUpper`) bound for `expr` while using -/// the constant lower and upper bounds for its inputs provided in -/// `constLowerBounds` and `constUpperBounds`. Return std::nullopt if such a -/// bound can't be computed. This method only handles simple sum of product -/// expressions (w.r.t constant coefficients) so as to not depend on anything -/// heavyweight in `Analysis`. Expressions of the form: c0*d0 + c1*d1 + c2*s0 + -/// ... + c_n are handled. Expressions involving floordiv, ceildiv, mod or -/// semi-affine ones will lead std::nullopt being returned. -static std::optional -getBoundForExpr(AffineExpr expr, unsigned numDims, unsigned numSymbols, - ArrayRef> constLowerBounds, - ArrayRef> constUpperBounds, - bool isUpper) { - // Handle divs and mods. - if (auto binOpExpr = expr.dyn_cast()) { - // If the LHS of a floor or ceil is bounded and the RHS is a constant, we - // can compute an upper bound. - if (binOpExpr.getKind() == AffineExprKind::FloorDiv) { - auto rhsConst = binOpExpr.getRHS().dyn_cast(); - if (!rhsConst || rhsConst.getValue() < 1) - return std::nullopt; - auto bound = getBoundForExpr(binOpExpr.getLHS(), numDims, numSymbols, - constLowerBounds, constUpperBounds, isUpper); - if (!bound) - return std::nullopt; - return mlir::floorDiv(*bound, rhsConst.getValue()); - } - if (binOpExpr.getKind() == AffineExprKind::CeilDiv) { - auto rhsConst = binOpExpr.getRHS().dyn_cast(); - if (rhsConst && rhsConst.getValue() >= 1) { - auto bound = - getBoundForExpr(binOpExpr.getLHS(), numDims, numSymbols, - constLowerBounds, constUpperBounds, isUpper); - if (!bound) - return std::nullopt; - return mlir::ceilDiv(*bound, rhsConst.getValue()); - } - return std::nullopt; - } - if (binOpExpr.getKind() == AffineExprKind::Mod) { - // lhs mod c is always <= c - 1 and non-negative. 
In addition, if `lhs` is - // bounded such that lb <= lhs <= ub and lb floordiv c == ub floordiv c - // (same "interval"), then lb mod c <= lhs mod c <= ub mod c. - auto rhsConst = binOpExpr.getRHS().dyn_cast(); - if (rhsConst && rhsConst.getValue() >= 1) { - int64_t rhsConstVal = rhsConst.getValue(); - auto lb = getBoundForExpr(binOpExpr.getLHS(), numDims, numSymbols, - constLowerBounds, constUpperBounds, - /*isUpper=*/false); - auto ub = getBoundForExpr(binOpExpr.getLHS(), numDims, numSymbols, - constLowerBounds, constUpperBounds, isUpper); - if (ub && lb && - floorDiv(*lb, rhsConstVal) == floorDiv(*ub, rhsConstVal)) - return isUpper ? mod(*ub, rhsConstVal) : mod(*lb, rhsConstVal); - return isUpper ? rhsConstVal - 1 : 0; - } - } - } - // Flatten the expression. - SimpleAffineExprFlattener flattener(numDims, numSymbols); - flattener.walkPostOrder(expr); - ArrayRef flattenedExpr = flattener.operandExprStack.back(); - // TODO: Handle local variables. We can get hold of flattener.localExprs and - // get bound on the local expr recursively. - if (flattener.numLocals > 0) - return std::nullopt; - int64_t bound = 0; - // Substitute the constant lower or upper bound for the dimensional or - // symbolic input depending on `isUpper` to determine the bound. - for (unsigned i = 0, e = numDims + numSymbols; i < e; ++i) { - if (flattenedExpr[i] > 0) { - auto &constBound = isUpper ? constUpperBounds[i] : constLowerBounds[i]; - if (!constBound) - return std::nullopt; - bound += *constBound * flattenedExpr[i]; - } else if (flattenedExpr[i] < 0) { - auto &constBound = isUpper ? constLowerBounds[i] : constUpperBounds[i]; - if (!constBound) - return std::nullopt; - bound += *constBound * flattenedExpr[i]; - } - } - // Constant term. - bound += flattenedExpr.back(); - return bound; -} - /// Determine a constant upper bound for `expr` if one exists while exploiting /// values in `operands`. Note that the upper bound is an inclusive one. `expr` /// is guaranteed to be less than or equal to it. 
@@ -805,9 +718,9 @@ static std::optional getUpperBound(AffineExpr expr, unsigned numDims, if (auto constExpr = expr.dyn_cast()) return constExpr.getValue(); - return getBoundForExpr(expr, numDims, numSymbols, constLowerBounds, - constUpperBounds, - /*isUpper=*/true); + return getBoundForAffineExpr(expr, numDims, numSymbols, constLowerBounds, + constUpperBounds, + /*isUpper=*/true); } /// Determine a constant lower bound for `expr` if one exists while exploiting @@ -829,9 +742,9 @@ static std::optional getLowerBound(AffineExpr expr, unsigned numDims, if (auto constExpr = expr.dyn_cast()) { lowerBound = constExpr.getValue(); } else { - lowerBound = getBoundForExpr(expr, numDims, numSymbols, constLowerBounds, - constUpperBounds, - /*isUpper=*/false); + lowerBound = getBoundForAffineExpr(expr, numDims, numSymbols, + constLowerBounds, constUpperBounds, + /*isUpper=*/false); } return lowerBound; } @@ -970,14 +883,14 @@ static void simplifyMinOrMaxExprWithOperands(AffineMap &map, lowerBounds.push_back(constExpr.getValue()); upperBounds.push_back(constExpr.getValue()); } else { - lowerBounds.push_back(getBoundForExpr(e, map.getNumDims(), - map.getNumSymbols(), - constLowerBounds, constUpperBounds, - /*isUpper=*/false)); - upperBounds.push_back(getBoundForExpr(e, map.getNumDims(), - map.getNumSymbols(), - constLowerBounds, constUpperBounds, - /*isUpper=*/true)); + lowerBounds.push_back( + getBoundForAffineExpr(e, map.getNumDims(), map.getNumSymbols(), + constLowerBounds, constUpperBounds, + /*isUpper=*/false)); + upperBounds.push_back( + getBoundForAffineExpr(e, map.getNumDims(), map.getNumSymbols(), + constLowerBounds, constUpperBounds, + /*isUpper=*/true)); } } diff --git a/mlir/lib/IR/AffineExpr.cpp b/mlir/lib/IR/AffineExpr.cpp index 7eccbca4e6e7a..4b7ec89a842bd 100644 --- a/mlir/lib/IR/AffineExpr.cpp +++ b/mlir/lib/IR/AffineExpr.cpp @@ -1438,3 +1438,83 @@ AffineExpr mlir::simplifyAffineExpr(AffineExpr expr, unsigned numDims, assert(flattener.operandExprStack.empty()); return simplifiedExpr; } + +std::optional mlir::getBoundForAffineExpr( + AffineExpr expr, unsigned numDims, unsigned numSymbols, + ArrayRef> constLowerBounds, + ArrayRef> constUpperBounds, bool isUpper) { + // Handle divs and mods. + if (auto binOpExpr = expr.dyn_cast()) { + // If the LHS of a floor or ceil is bounded and the RHS is a constant, we + // can compute an upper bound. + if (binOpExpr.getKind() == AffineExprKind::FloorDiv) { + auto rhsConst = binOpExpr.getRHS().dyn_cast(); + if (!rhsConst || rhsConst.getValue() < 1) + return std::nullopt; + auto bound = + getBoundForAffineExpr(binOpExpr.getLHS(), numDims, numSymbols, + constLowerBounds, constUpperBounds, isUpper); + if (!bound) + return std::nullopt; + return mlir::floorDiv(*bound, rhsConst.getValue()); + } + if (binOpExpr.getKind() == AffineExprKind::CeilDiv) { + auto rhsConst = binOpExpr.getRHS().dyn_cast(); + if (rhsConst && rhsConst.getValue() >= 1) { + auto bound = + getBoundForAffineExpr(binOpExpr.getLHS(), numDims, numSymbols, + constLowerBounds, constUpperBounds, isUpper); + if (!bound) + return std::nullopt; + return mlir::ceilDiv(*bound, rhsConst.getValue()); + } + return std::nullopt; + } + if (binOpExpr.getKind() == AffineExprKind::Mod) { + // lhs mod c is always <= c - 1 and non-negative. In addition, if `lhs` is + // bounded such that lb <= lhs <= ub and lb floordiv c == ub floordiv c + // (same "interval"), then lb mod c <= lhs mod c <= ub mod c. 
+      auto rhsConst = binOpExpr.getRHS().dyn_cast<AffineConstantExpr>();
+      if (rhsConst && rhsConst.getValue() >= 1) {
+        int64_t rhsConstVal = rhsConst.getValue();
+        auto lb = getBoundForAffineExpr(binOpExpr.getLHS(), numDims, numSymbols,
+                                        constLowerBounds, constUpperBounds,
+                                        /*isUpper=*/false);
+        auto ub =
+            getBoundForAffineExpr(binOpExpr.getLHS(), numDims, numSymbols,
+                                  constLowerBounds, constUpperBounds, isUpper);
+        if (ub && lb &&
+            floorDiv(*lb, rhsConstVal) == floorDiv(*ub, rhsConstVal))
+          return isUpper ? mod(*ub, rhsConstVal) : mod(*lb, rhsConstVal);
+        return isUpper ? rhsConstVal - 1 : 0;
+      }
+    }
+  }
+  // Flatten the expression.
+  SimpleAffineExprFlattener flattener(numDims, numSymbols);
+  flattener.walkPostOrder(expr);
+  ArrayRef<int64_t> flattenedExpr = flattener.operandExprStack.back();
+  // TODO: Handle local variables. We can get hold of flattener.localExprs and
+  // get bound on the local expr recursively.
+  if (flattener.numLocals > 0)
+    return std::nullopt;
+  int64_t bound = 0;
+  // Substitute the constant lower or upper bound for the dimensional or
+  // symbolic input depending on `isUpper` to determine the bound.
+  for (unsigned i = 0, e = numDims + numSymbols; i < e; ++i) {
+    if (flattenedExpr[i] > 0) {
+      auto &constBound = isUpper ? constUpperBounds[i] : constLowerBounds[i];
+      if (!constBound)
+        return std::nullopt;
+      bound += *constBound * flattenedExpr[i];
+    } else if (flattenedExpr[i] < 0) {
+      auto &constBound = isUpper ? constLowerBounds[i] : constUpperBounds[i];
+      if (!constBound)
+        return std::nullopt;
+      bound += *constBound * flattenedExpr[i];
+    }
+  }
+  // Constant term.
+  bound += flattenedExpr.back();
+  return bound;
+}

From 4f131b0d2214b5fbaef43d130e576b40a20bda32 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Thu, 26 Oct 2023 10:19:06 +0200
Subject: [PATCH 027/877] [IR] Require index width to be ule pointer width
 (#70015)

I don't think there is a use case for having an index type that is
wider than the pointer type, and I'm not entirely clear what semantics
this would even have. Also clarify the GEP semantics to explicitly say
how they interact with the index type width.

---
 llvm/docs/LangRef.rst                         | 31 +++---
 llvm/lib/IR/DataLayout.cpp                    |  2 +
 .../ptrtoint-constantexpr-loop.ll             | 97 -------------------
 .../invalid-datalayout-index-size.ll          |  3 +
 .../test/Transforms/InferAlignment/ptrmask.ll | 15 +--
 mlir/test/Target/LLVMIR/Import/data-layout.ll |  4 +-
 mlir/test/Target/LLVMIR/data-layout.mlir      |  4 +-
 7 files changed, 30 insertions(+), 126 deletions(-)
 create mode 100644 llvm/test/Assembler/invalid-datalayout-index-size.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index e4625cae9fc53..cabb5cd1bed62 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -2883,7 +2883,8 @@ as follows:
     This specifies the *size* of a pointer and its ``<abi>`` and
    ``<pref>``\erred alignments for address space ``n``. ``<pref>`` is optional
    and defaults to ``<abi>``. The fourth parameter ``<idx>`` is the size of the
-    index that used for address calculation. If not
+    index that used for address calculation, which must be less than or equal
+    to the pointer size. If not
    specified, the default index size is equal to the pointer size. All sizes
    are in bits. The address space, ``n``, is optional, and if not
    specified, denotes the default address space 0. The value of ``n`` must be
@@ -11030,6 +11031,24 @@ for the given testcase is equivalent to:
     ret ptr %t5
   }
 
+The indices are first converted to offsets in the pointer's index type. If the
+currently indexed type is a struct type, the struct offset corresponding to the
+index is sign-extended or truncated to the pointer index type. Otherwise, the
+index itself is sign-extended or truncated, and then multiplied by the type
+allocation size (that is, the size rounded up to the ABI alignment) of the
+currently indexed type.
+
+The offsets are then added to the low bits of the base address up to the index
+type width, with silently-wrapping two's complement arithmetic. If the pointer
+size is larger than the index size, this means that the bits outside the index
+type width will not be affected.
+
+The result value of the ``getelementptr`` may be outside the object pointed
+to by the base pointer. The result value may not necessarily be used to access
+memory though, even if it happens to point into allocated storage. See the
+:ref:`Pointer Aliasing Rules <pointeraliasing>` section for more
+information.
+
 If the ``inbounds`` keyword is present, the result value of a
 ``getelementptr`` with any non-zero indices is a :ref:`poison value
 <poisonvalues>` if one of the following rules is violated:
@@ -11061,16 +11080,6 @@ These rules are based on the assumption that no allocated object may cross the
 unsigned address space boundary, and no allocated object may be larger than
 half the pointer index type space.
 
-If the ``inbounds`` keyword is not present, the offsets are added to the
-base address with silently-wrapping two's complement arithmetic. If the
-offsets have a different width from the pointer's index type, they are
-sign-extended or truncated to the width of the pointer's index type. The result
-value of the ``getelementptr`` may be outside the object pointed to by the base
-pointer. The result value may not necessarily be used to access memory
-though, even if it happens to point into allocated storage. See the
-:ref:`Pointer Aliasing Rules <pointeraliasing>` section for more
-information.
- If the ``inrange`` keyword is present before any index, loading from or storing to any pointer derived from the ``getelementptr`` has undefined behavior if the load or store would access memory outside of the bounds of diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index d324c4b488f72..c8ef232082554 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -649,6 +649,8 @@ Error DataLayout::setPointerAlignmentInBits(uint32_t AddrSpace, Align ABIAlign, if (PrefAlign < ABIAlign) return reportError( "Preferred alignment cannot be less than the ABI alignment"); + if (IndexBitWidth > TypeBitWidth) + return reportError("Index width cannot be larger than pointer width"); auto I = lower_bound(Pointers, AddrSpace, [](const PointerAlignElem &A, uint32_t AddressSpace) { diff --git a/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll b/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll index ca39b0b40e316..31a6a6ec0023e 100644 --- a/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll +++ b/llvm/test/Analysis/ScalarEvolution/ptrtoint-constantexpr-loop.ll @@ -2,7 +2,6 @@ ; RUN: opt < %s --data-layout="p:64:64:64:64" -S -disable-output "-passes=print" 2>&1 | FileCheck --check-prefixes=PTR64_IDX64 %s ; RUN: opt < %s --data-layout="p:64:64:64:32" -S -disable-output "-passes=print" 2>&1 | FileCheck --check-prefixes=PTR64_IDX32 %s ; RUN: opt < %s --data-layout="p:16:16:16:16" -S -disable-output "-passes=print" 2>&1 | FileCheck --check-prefixes=PTR16_IDX16 %s -; RUN: opt < %s --data-layout="p:16:16:16:32" -S -disable-output "-passes=print" 2>&1 | FileCheck --check-prefixes=PTR16_IDX32 %s @global = external hidden global [0 x i8] @@ -63,24 +62,6 @@ define hidden ptr @trunc_ptr_to_i64(ptr %arg, ptr %arg10) { ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. ; -; PTR16_IDX32-LABEL: 'trunc_ptr_to_i64' -; PTR16_IDX32-NEXT: Classifying expressions for: @trunc_ptr_to_i64 -; PTR16_IDX32-NEXT: %tmp = phi i32 [ 0, %bb ], [ %tmp18, %bb17 ] -; PTR16_IDX32-NEXT: --> {0,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: %tmp12 = getelementptr i8, ptr %arg, i64 ptrtoint (ptr @global to i64) -; PTR16_IDX32-NEXT: --> ((trunc i64 ptrtoint (ptr @global to i64) to i32) + %arg) U: [0,131071) S: [0,131071) Exits: ((trunc i64 ptrtoint (ptr @global to i64) to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp13 = bitcast ptr %tmp12 to ptr -; PTR16_IDX32-NEXT: --> ((trunc i64 ptrtoint (ptr @global to i64) to i32) + %arg) U: [0,131071) S: [0,131071) Exits: ((trunc i64 ptrtoint (ptr @global to i64) to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp14 = load i32, ptr %tmp13, align 4 -; PTR16_IDX32-NEXT: --> %tmp14 U: full-set S: full-set Exits: <> LoopDispositions: { %bb11: Variant } -; PTR16_IDX32-NEXT: %tmp18 = add i32 %tmp, 2 -; PTR16_IDX32-NEXT: --> {2,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @trunc_ptr_to_i64 -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable constant max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. 
-; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. -; bb: br label %bb11 @@ -154,24 +135,6 @@ define hidden ptr @trunc_ptr_to_i32(ptr %arg, ptr %arg10) { ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. ; -; PTR16_IDX32-LABEL: 'trunc_ptr_to_i32' -; PTR16_IDX32-NEXT: Classifying expressions for: @trunc_ptr_to_i32 -; PTR16_IDX32-NEXT: %tmp = phi i32 [ 0, %bb ], [ %tmp18, %bb17 ] -; PTR16_IDX32-NEXT: --> {0,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: %tmp12 = getelementptr i8, ptr %arg, i32 ptrtoint (ptr @global to i32) -; PTR16_IDX32-NEXT: --> (ptrtoint (ptr @global to i32) + %arg) U: [0,131071) S: [0,131071) Exits: (ptrtoint (ptr @global to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp13 = bitcast ptr %tmp12 to ptr -; PTR16_IDX32-NEXT: --> (ptrtoint (ptr @global to i32) + %arg) U: [0,131071) S: [0,131071) Exits: (ptrtoint (ptr @global to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp14 = load i32, ptr %tmp13, align 4 -; PTR16_IDX32-NEXT: --> %tmp14 U: full-set S: full-set Exits: <> LoopDispositions: { %bb11: Variant } -; PTR16_IDX32-NEXT: %tmp18 = add i32 %tmp, 2 -; PTR16_IDX32-NEXT: --> {2,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @trunc_ptr_to_i32 -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable constant max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. -; bb: br label %bb11 @@ -245,24 +208,6 @@ define hidden ptr @trunc_ptr_to_i128(ptr %arg, ptr %arg10) { ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. ; PTR16_IDX16-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. ; -; PTR16_IDX32-LABEL: 'trunc_ptr_to_i128' -; PTR16_IDX32-NEXT: Classifying expressions for: @trunc_ptr_to_i128 -; PTR16_IDX32-NEXT: %tmp = phi i32 [ 0, %bb ], [ %tmp18, %bb17 ] -; PTR16_IDX32-NEXT: --> {0,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: %tmp12 = getelementptr i8, ptr %arg, i128 ptrtoint (ptr @global to i128) -; PTR16_IDX32-NEXT: --> ((trunc i128 ptrtoint (ptr @global to i128) to i32) + %arg) U: [0,131071) S: [0,131071) Exits: ((trunc i128 ptrtoint (ptr @global to i128) to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp13 = bitcast ptr %tmp12 to ptr -; PTR16_IDX32-NEXT: --> ((trunc i128 ptrtoint (ptr @global to i128) to i32) + %arg) U: [0,131071) S: [0,131071) Exits: ((trunc i128 ptrtoint (ptr @global to i128) to i32) + %arg) LoopDispositions: { %bb11: Invariant } -; PTR16_IDX32-NEXT: %tmp14 = load i32, ptr %tmp13, align 4 -; PTR16_IDX32-NEXT: --> %tmp14 U: full-set S: full-set Exits: <> LoopDispositions: { %bb11: Variant } -; PTR16_IDX32-NEXT: %tmp18 = add i32 %tmp, 2 -; PTR16_IDX32-NEXT: --> {2,+,2}<%bb11> U: [0,-1) S: [-2147483648,2147483647) Exits: <> LoopDispositions: { %bb11: Computable } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @trunc_ptr_to_i128 -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable backedge-taken count. 
-; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable constant max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable symbolic max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb11: Unpredictable predicated backedge-taken count. -; bb: br label %bb11 @@ -319,18 +264,6 @@ define void @zext_ptr_to_i32(i32 %arg, i32 %arg6) { ; PTR16_IDX16-NEXT: Loop %bb7: Unpredictable symbolic max backedge-taken count. ; PTR16_IDX16-NEXT: Loop %bb7: Unpredictable predicated backedge-taken count. ; -; PTR16_IDX32-LABEL: 'zext_ptr_to_i32' -; PTR16_IDX32-NEXT: Classifying expressions for: @zext_ptr_to_i32 -; PTR16_IDX32-NEXT: %tmp = sub i32 %arg, ptrtoint (ptr @global to i32) -; PTR16_IDX32-NEXT: --> ((-1 * ptrtoint (ptr @global to i32)) + %arg) U: full-set S: full-set Exits: ((-1 * ptrtoint (ptr @global to i32)) + %arg) LoopDispositions: { %bb7: Invariant } -; PTR16_IDX32-NEXT: %tmp9 = select i1 %tmp8, i16 0, i16 1 -; PTR16_IDX32-NEXT: --> %tmp9 U: [0,2) S: [0,2) Exits: <> LoopDispositions: { %bb7: Variant } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @zext_ptr_to_i32 -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable constant max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable symbolic max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable predicated backedge-taken count. -; bb: br label %bb7 @@ -382,18 +315,6 @@ define void @sext_to_i32(i32 %arg, i32 %arg6) { ; PTR16_IDX16-NEXT: Loop %bb7: Unpredictable symbolic max backedge-taken count. ; PTR16_IDX16-NEXT: Loop %bb7: Unpredictable predicated backedge-taken count. ; -; PTR16_IDX32-LABEL: 'sext_to_i32' -; PTR16_IDX32-NEXT: Classifying expressions for: @sext_to_i32 -; PTR16_IDX32-NEXT: %tmp = sub i32 %arg, sext (i16 ptrtoint (ptr @global to i16) to i32) -; PTR16_IDX32-NEXT: --> ((-1 * (sext i16 ptrtoint (ptr @global to i16) to i32)) + %arg) U: full-set S: full-set Exits: ((-1 * (sext i16 ptrtoint (ptr @global to i16) to i32)) + %arg) LoopDispositions: { %bb7: Invariant } -; PTR16_IDX32-NEXT: %tmp9 = select i1 %tmp8, i16 0, i16 1 -; PTR16_IDX32-NEXT: --> %tmp9 U: [0,2) S: [0,2) Exits: <> LoopDispositions: { %bb7: Variant } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @sext_to_i32 -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable constant max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable symbolic max backedge-taken count. -; PTR16_IDX32-NEXT: Loop %bb7: Unpredictable predicated backedge-taken count. 
-; bb: br label %bb7 @@ -463,24 +384,6 @@ define i64 @sext_like_noop(i32 %n) { ; PTR16_IDX16-NEXT: Predicates: ; PTR16_IDX16: Loop %for.body: Trip multiple is 1 ; -; PTR16_IDX32-LABEL: 'sext_like_noop' -; PTR16_IDX32-NEXT: Classifying expressions for: @sext_like_noop -; PTR16_IDX32-NEXT: %ii = sext i32 %i to i64 -; PTR16_IDX32-NEXT: --> (sext i32 {1,+,1}<%for.body> to i64) U: [-2147483648,2147483648) S: [-2147483648,2147483648) --> (-1 + (zext i32 ptrtoint (ptr @sext_like_noop to i32) to i64)) U: [-1,65535) S: [-1,65535) -; PTR16_IDX32-NEXT: %div = sdiv i64 55555, %ii -; PTR16_IDX32-NEXT: --> %div U: full-set S: full-set -; PTR16_IDX32-NEXT: %i = phi i32 [ %inc, %for.body ], [ 1, %entry ] -; PTR16_IDX32-NEXT: --> {1,+,1}<%for.body> U: [1,0) S: [1,0) Exits: (-1 + ptrtoint (ptr @sext_like_noop to i32)) LoopDispositions: { %for.body: Computable } -; PTR16_IDX32-NEXT: %inc = add nuw i32 %i, 1 -; PTR16_IDX32-NEXT: --> {2,+,1}<%for.body> U: [2,0) S: [2,0) Exits: ptrtoint (ptr @sext_like_noop to i32) LoopDispositions: { %for.body: Computable } -; PTR16_IDX32-NEXT: Determining loop execution counts for: @sext_like_noop -; PTR16_IDX32-NEXT: Loop %for.body: backedge-taken count is (-2 + ptrtoint (ptr @sext_like_noop to i32)) -; PTR16_IDX32-NEXT: Loop %for.body: constant max backedge-taken count is -1 -; PTR16_IDX32-NEXT: Loop %for.body: symbolic max backedge-taken count is (-2 + ptrtoint (ptr @sext_like_noop to i32)) -; PTR16_IDX32-NEXT: Loop %for.body: Predicated backedge-taken count is (-2 + ptrtoint (ptr @sext_like_noop to i32)) -; PTR16_IDX32-NEXT: Predicates: -; PTR16_IDX32: Loop %for.body: Trip multiple is 1 -; entry: %cmp6 = icmp sgt i32 %n, 1 br label %for.body diff --git a/llvm/test/Assembler/invalid-datalayout-index-size.ll b/llvm/test/Assembler/invalid-datalayout-index-size.ll new file mode 100644 index 0000000000000..dc608cdd56a04 --- /dev/null +++ b/llvm/test/Assembler/invalid-datalayout-index-size.ll @@ -0,0 +1,3 @@ +; RUN: not llvm-as < %s 2>&1 | FileCheck %s +target datalayout = "p:64:64:64:128" +; CHECK: Index width cannot be larger than pointer width diff --git a/llvm/test/Transforms/InferAlignment/ptrmask.ll b/llvm/test/Transforms/InferAlignment/ptrmask.ll index afab872d16d5e..7fb0220e92b96 100644 --- a/llvm/test/Transforms/InferAlignment/ptrmask.ll +++ b/llvm/test/Transforms/InferAlignment/ptrmask.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes=infer-alignment -S | FileCheck %s -target datalayout = "p1:64:64:64:32-p2:64:64:64:128" +target datalayout = "p1:64:64:64:32" ; ------------------------------------------------------------------------------ ; load instructions @@ -88,18 +88,5 @@ define i8 @smaller_index_type(ptr addrspace(1) %ptr) { ret i8 %load } -define i8 @larger_index_type(ptr addrspace(2) %ptr) { -; CHECK-LABEL: define i8 @larger_index_type -; CHECK-SAME: (ptr addrspace(2) [[PTR:%.*]]) { -; CHECK-NEXT: [[PTR2:%.*]] = call ptr addrspace(2) @llvm.ptrmask.p2.i128(ptr addrspace(2) [[PTR]], i128 -4) -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(2) [[PTR2]], align 4 -; CHECK-NEXT: ret i8 [[LOAD]] -; - %ptr2 = call ptr addrspace(2) @llvm.ptrmask.p2.i128(ptr addrspace(2) %ptr, i128 -4) - %load = load i8, ptr addrspace(2) %ptr2, align 1 - ret i8 %load -} - declare ptr @llvm.ptrmask.p0.i64(ptr, i64) declare ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1), i32) -declare ptr addrspace(2) @llvm.ptrmask.p2.i128(ptr addrspace(2), i128) diff --git 
a/mlir/test/Target/LLVMIR/Import/data-layout.ll b/mlir/test/Target/LLVMIR/Import/data-layout.ll index b46f5fc4f2ae2..4336f0dbbc4a9 100644 --- a/mlir/test/Target/LLVMIR/Import/data-layout.ll +++ b/mlir/test/Target/LLVMIR/Import/data-layout.ll @@ -36,11 +36,11 @@ target datalayout = "e-m:e-p270:32:64-p271:32:32-p272:64:64-i64:64-f80:128-n8:16 ; CHECK: dlti.dl_spec = ; CHECK: #dlti.dl_spec< ; CHECK-DAG: #dlti.dl_entry<"dlti.endianness", "big"> -; CHECK-DAG: #dlti.dl_entry, dense<[16, 32, 64, 128]> : vector<4xi32>> +; CHECK-DAG: #dlti.dl_entry, dense<[16, 32, 64, 8]> : vector<4xi32>> ; CHECK-DAG: #dlti.dl_entry, dense<[16, 32, 64, 16]> : vector<4xi32>> ; CHECK-DAG: #dlti.dl_entry<"dlti.alloca_memory_space", 1 : ui32> ; CHECK-DAG: #dlti.dl_entry : vector<2xi32>> -target datalayout = "A1-E-p270:16:32:64:128-p271:16:32:64-i64:64:128" +target datalayout = "A1-E-p270:16:32:64:8-p271:16:32:64-i64:64:128" ; // ----- diff --git a/mlir/test/Target/LLVMIR/data-layout.mlir b/mlir/test/Target/LLVMIR/data-layout.mlir index 74941d84d7d21..f1304e12c303d 100644 --- a/mlir/test/Target/LLVMIR/data-layout.mlir +++ b/mlir/test/Target/LLVMIR/data-layout.mlir @@ -7,7 +7,7 @@ // CHECK: i64:64:128 // CHECK: f80:128:256 // CHECK: p0:32:64:128 -// CHECK: p1:32:32:32:64 +// CHECK: p1:32:32:32:16 module attributes {dlti.dl_spec = #dlti.dl_spec< #dlti.dl_entry<"dlti.endianness", "big">, #dlti.dl_entry<"dlti.alloca_memory_space", 4 : ui32>, @@ -16,7 +16,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec< #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<2xi32>>, #dlti.dl_entry : vector<3xi32>>, -#dlti.dl_entry, dense<[32,32,32,64]> : vector<4xi32>> +#dlti.dl_entry, dense<[32,32,32,16]> : vector<4xi32>> >} { llvm.func @foo() { llvm.return From 0c6759b576947def0210e1bfb135fe8092d51c45 Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Thu, 26 Oct 2023 10:33:45 +0200 Subject: [PATCH 028/877] [Bazel] Fixes for ec6da06. 
---
 .../bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel | 1 +
 .../bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel | 1 +
 2 files changed, 2 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
index af5573b1141cd..7543db7c2c279 100644
--- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch6/BUILD.bazel
@@ -97,6 +97,7 @@ cc_binary(
         ":ToyInterfacesIncGen",
         ":ToyOpsIncGen",
         "//llvm:Core",
+        "//llvm:OrcJIT",
         "//llvm:Support",
         "//mlir:AffineDialect",
         "//mlir:AffineToStandard",
diff --git a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
index 6e3ad89a9430a..297fb4e17e8e9 100644
--- a/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/examples/toy/Ch7/BUILD.bazel
@@ -97,6 +97,7 @@ cc_binary(
         ":ToyInterfacesIncGen",
         ":ToyOpsIncGen",
         "//llvm:Core",
+        "//llvm:OrcJIT",
         "//llvm:Support",
         "//mlir:AffineDialect",
         "//mlir:AffineToStandard",

From 80ff42bc7bb8188fb0c239c9f2035faf1398e59a Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Thu, 26 Oct 2023 10:36:26 +0200
Subject: [PATCH 029/877] [scev-aa] Make TypeSize -> uint64_t conversion
 explicit

Some versions of MSVC have issues with the implicit conversion, and it
also makes it clearer what's going on (the current code doesn't handle
scalable vectors).

---
 llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
index dc1af1bbb1d16..af8232b03f1ed 100644
--- a/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/ScalarEvolutionAliasAnalysis.cpp
@@ -55,10 +55,10 @@ AliasResult SCEVAAResult::alias(const MemoryLocation &LocA,
   if (canComputePointerDiff(SE, AS, BS)) {
     unsigned BitWidth = SE.getTypeSizeInBits(AS->getType());
     APInt ASizeInt(BitWidth, LocA.Size.hasValue()
-                                 ? LocA.Size.getValue()
+                                 ? static_cast<uint64_t>(LocA.Size.getValue())
                                  : MemoryLocation::UnknownSize);
     APInt BSizeInt(BitWidth, LocB.Size.hasValue()
-                                 ? LocB.Size.getValue()
+                                 ? static_cast<uint64_t>(LocB.Size.getValue())
                                  : MemoryLocation::UnknownSize);
 
     // Compute the difference between the two pointers.

From 339faffd053b60bee3515fe7c4cca5e76f2cf427 Mon Sep 17 00:00:00 2001
From: Oliver Stannard
Date: Thu, 26 Oct 2023 09:50:13 +0100
Subject: [PATCH 030/877] Revert "[AArch64] Move SLS later in pass pipeline"

The (MF.size() == 0) assert is triggering when building at -O0.
Reverting this while I work out what is going wrong.

This reverts commit 7e8eccd990d37d2771ca5ad7a84f54c3cfc4a5e1.
--- .../Target/AArch64/AArch64SLSHardening.cpp | 3 +-- .../Target/AArch64/AArch64TargetMachine.cpp | 5 ++-- llvm/test/CodeGen/AArch64/O0-pipeline.ll | 4 ++-- llvm/test/CodeGen/AArch64/O3-pipeline.ll | 4 ++-- .../AArch64/arm64-opt-remarks-lazy-bfi.ll | 24 ++++--------------- .../AArch64/sls-stackprotector-outliner.ll | 12 ++++------ 6 files changed, 16 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp index 518ca336b5cde..ff56259eb34a7 100644 --- a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp +++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp @@ -222,8 +222,7 @@ void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) { const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); - assert (MF.size() == 0); - MF.push_back(MF.CreateMachineBasicBlock()); + assert (MF.size() == 1); MachineBasicBlock *Entry = &MF.front(); Entry->clear(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 02670fff35cce..3d818c76bd4b7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -796,6 +796,9 @@ void AArch64PassConfig::addPreSched2() { // info. addPass(createAArch64SpeculationHardeningPass()); + addPass(createAArch64IndirectThunks()); + addPass(createAArch64SLSHardeningPass()); + if (TM->getOptLevel() != CodeGenOptLevel::None) { if (EnableFalkorHWPFFix) addPass(createFalkorHWPFFixPass()); @@ -828,8 +831,6 @@ void AArch64PassConfig::addPreEmitPass() { } void AArch64PassConfig::addPostBBSections() { - addPass(createAArch64IndirectThunks()); - addPass(createAArch64SLSHardeningPass()); addPass(createAArch64PointerAuthPass()); if (EnableBranchTargets) addPass(createAArch64BranchTargetsPass()); diff --git a/llvm/test/CodeGen/AArch64/O0-pipeline.ll b/llvm/test/CodeGen/AArch64/O0-pipeline.ll index d1e38b85fa9c3..4f87bb2a3ee81 100644 --- a/llvm/test/CodeGen/AArch64/O0-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O0-pipeline.ll @@ -64,6 +64,8 @@ ; CHECK-NEXT: AArch64 pseudo instruction expansion pass ; CHECK-NEXT: Insert KCFI indirect call checks ; CHECK-NEXT: AArch64 speculation hardening pass +; CHECK-NEXT: AArch64 Indirect Thunks +; CHECK-NEXT: AArch64 sls hardening pass ; CHECK-NEXT: Analyze Machine Code For Garbage Collection ; CHECK-NEXT: Insert fentry calls ; CHECK-NEXT: Insert XRay ops @@ -73,8 +75,6 @@ ; CHECK-NEXT: StackMap Liveness Analysis ; CHECK-NEXT: Live DEBUG_VALUE analysis ; CHECK-NEXT: Machine Sanitizer Binary Metadata -; CHECK-NEXT: AArch64 Indirect Thunks -; CHECK-NEXT: AArch64 sls hardening pass ; CHECK-NEXT: AArch64 Pointer Authentication ; CHECK-NEXT: AArch64 Branch Targets ; CHECK-NEXT: Branch relaxation pass diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll index 85a9eb49a9a26..f5c1c3c291cb5 100644 --- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll +++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll @@ -201,6 +201,8 @@ ; CHECK-NEXT: AArch64 load / store optimization pass ; CHECK-NEXT: Insert KCFI indirect call checks ; CHECK-NEXT: AArch64 speculation hardening pass +; CHECK-NEXT: AArch64 Indirect Thunks +; CHECK-NEXT: AArch64 sls hardening pass ; CHECK-NEXT: MachineDominator Tree Construction ; CHECK-NEXT: Machine Natural Loop Construction ; CHECK-NEXT: Falkor HW Prefetch Fix Late Phase @@ -221,8 +223,6 @@ ; CHECK-NEXT: Machine Sanitizer Binary Metadata ; CHECK-NEXT: Machine Outliner ; CHECK-NEXT: FunctionPass Manager 
-; CHECK-NEXT: AArch64 Indirect Thunks -; CHECK-NEXT: AArch64 sls hardening pass ; CHECK-NEXT: AArch64 Pointer Authentication ; CHECK-NEXT: AArch64 Branch Targets ; CHECK-NEXT: Branch relaxation pass diff --git a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll index 3ffaf962425b3..580886520789e 100644 --- a/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll +++ b/llvm/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll @@ -32,16 +32,8 @@ ; HOTNESS: Freeing Pass 'Machine Outliner' ; HOTNESS-NEXT: Executing Pass 'Function Pass Manager' -; HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... -; HOTNESS-NEXT: Executing Pass 'AArch64 Indirect Thunks' on Function 'empty_func'... -; HOTNESS-NEXT: Freeing Pass 'AArch64 Indirect Thunks' on Function 'empty_func'... -; HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... -; HOTNESS-NEXT: Executing Pass 'AArch64 sls hardening pass' on Function 'empty_func'... -; HOTNESS-NEXT: Freeing Pass 'AArch64 sls hardening pass' on Function 'empty_func'... -; HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... +; HOTNESS-NEXT: Executing Pass 'Verify generated machine code' +; HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' ; HOTNESS-NEXT: Executing Pass 'AArch64 Pointer Authentication' on Function 'empty_func'... ; HOTNESS-NEXT: Freeing Pass 'AArch64 Pointer Authentication' on Function 'empty_func'... ; HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... @@ -81,16 +73,8 @@ ; NO_HOTNESS: Freeing Pass 'Machine Outliner' ; NO_HOTNESS-NEXT: Executing Pass 'Function Pass Manager' -; NO_HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Executing Pass 'AArch64 Indirect Thunks' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Freeing Pass 'AArch64 Indirect Thunks' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Executing Pass 'AArch64 sls hardening pass' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Freeing Pass 'AArch64 sls hardening pass' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... -; NO_HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' on Function 'empty_func'... +; NO_HOTNESS-NEXT: Executing Pass 'Verify generated machine code' +; NO_HOTNESS-NEXT: Freeing Pass 'Verify generated machine code' ; NO_HOTNESS-NEXT: Executing Pass 'AArch64 Pointer Authentication' on Function 'empty_func'... ; NO_HOTNESS-NEXT: Freeing Pass 'AArch64 Pointer Authentication' on Function 'empty_func'... ; NO_HOTNESS-NEXT: Executing Pass 'Verify generated machine code' on Function 'empty_func'... 
diff --git a/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll b/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll index 809279eac576b..9e5e55512df81 100644 --- a/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll +++ b/llvm/test/CodeGen/AArch64/sls-stackprotector-outliner.ll @@ -18,8 +18,7 @@ define hidden void @_ZTv0_n24_N2C6D1Ev(ptr %this) minsize sspreq "target-feature ; CHECK-NEXT: b.ne .LBB0_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add x0, x0, x8 -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: bl OUTLINED_FUNCTION_1 ; CHECK-NEXT: b _ZN2C6D1Ev ; CHECK-NEXT: dsb sy ; CHECK-NEXT: isb @@ -46,8 +45,7 @@ define hidden void @_ZTv0_n24_N2C6D0Ev(ptr %this) minsize sspreq "target-feature ; CHECK-NEXT: b.ne .LBB1_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add x0, x0, x8 -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: bl OUTLINED_FUNCTION_1 ; CHECK-NEXT: b _ZN2C6D0Ev ; CHECK-NEXT: dsb sy ; CHECK-NEXT: isb @@ -73,8 +71,7 @@ define hidden void @_ZTv0_n24_N3C10D1Ev(ptr %this) minsize sspreq "target-featur ; CHECK-NEXT: b.ne .LBB2_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add x0, x0, x8 -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: bl OUTLINED_FUNCTION_1 ; CHECK-NEXT: b _ZN3C10D1Ev ; CHECK-NEXT: dsb sy ; CHECK-NEXT: isb @@ -100,8 +97,7 @@ define hidden void @_ZTv0_n24_N3C10D0Ev(ptr %this) minsize sspreq "target-featur ; CHECK-NEXT: b.ne .LBB3_2 ; CHECK-NEXT: // %bb.1: // %entry ; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: add x0, x0, x8 -; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: bl OUTLINED_FUNCTION_1 ; CHECK-NEXT: b _ZN3C10D0Ev ; CHECK-NEXT: dsb sy ; CHECK-NEXT: isb From cf3ac964dc0e1b967594aee2bdbfb6a4518e4dfe Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 26 Oct 2023 11:02:56 +0200 Subject: [PATCH 031/877] [InstCombine] Add additional demanded bits tests for shifts (NFC) --- llvm/test/Transforms/InstCombine/shift.ll | 72 +++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index bfed2dfe55fd5..7b9626331ff29 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -2126,3 +2126,75 @@ define <2 x i8> @ashr_vec_or6_fail(<2 x i8> %x, <2 x i8> %c) { %y = ashr <2 x i8> %x, %amt ret <2 x i8> %y } + +define i16 @lshr_and_not_demanded(i8 %x) { +; CHECK-LABEL: @lshr_and_not_demanded( +; CHECK-NEXT: [[Y_EXT:%.*]] = sext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[Y_EXT]], 1 +; CHECK-NEXT: ret i16 [[SHR]] +; + %y = and i8 %x, -2 + %y.ext = sext i8 %y to i16 + %shr = lshr i16 %y.ext, 1 + ret i16 %shr +} + +define i16 @lshr_exact_and_not_demanded(i8 %x) { +; CHECK-LABEL: @lshr_exact_and_not_demanded( +; CHECK-NEXT: [[Y:%.*]] = and i8 [[X:%.*]], -2 +; CHECK-NEXT: [[Y_EXT:%.*]] = sext i8 [[Y]] to i16 +; CHECK-NEXT: [[SHR:%.*]] = lshr exact i16 [[Y_EXT]], 1 +; CHECK-NEXT: ret i16 [[SHR]] +; + %y = and i8 %x, -2 + %y.ext = sext i8 %y to i16 + %shr = lshr exact i16 %y.ext, 1 + ret i16 %shr +} + +define i16 @lshr_and_demanded(i8 %x) { +; CHECK-LABEL: @lshr_and_demanded( +; CHECK-NEXT: [[Y:%.*]] = and i8 [[X:%.*]], -4 +; CHECK-NEXT: [[Y_EXT:%.*]] = sext i8 [[Y]] to i16 +; CHECK-NEXT: [[SHR:%.*]] = lshr exact i16 [[Y_EXT]], 1 +; CHECK-NEXT: ret i16 [[SHR]] +; + %y = and i8 %x, -4 + %y.ext = sext i8 %y to 
i16
+  %shr = lshr i16 %y.ext, 1
+  ret i16 %shr
+}
+
+define i16 @ashr_umax_not_demanded(i16 %x) {
+; CHECK-LABEL: @ashr_umax_not_demanded(
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i16 [[X:%.*]], 1
+; CHECK-NEXT:    ret i16 [[SHR]]
+;
+  %y = call i16 @llvm.umax.i16(i16 %x, i16 1)
+  %shr = ashr i16 %y, 1
+  ret i16 %shr
+}
+
+define i16 @ashr_exact_umax_not_demanded(i16 %x) {
+; CHECK-LABEL: @ashr_exact_umax_not_demanded(
+; CHECK-NEXT:    [[Y:%.*]] = call i16 @llvm.umax.i16(i16 [[X:%.*]], i16 1)
+; CHECK-NEXT:    [[SHR:%.*]] = ashr exact i16 [[Y]], 1
+; CHECK-NEXT:    ret i16 [[SHR]]
+;
+  %y = call i16 @llvm.umax.i16(i16 %x, i16 1)
+  %shr = ashr exact i16 %y, 1
+  ret i16 %shr
+}
+
+define i16 @ashr_umax_demanded(i16 %x) {
+; CHECK-LABEL: @ashr_umax_demanded(
+; CHECK-NEXT:    [[Y:%.*]] = call i16 @llvm.umax.i16(i16 [[X:%.*]], i16 2)
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i16 [[Y]], 1
+; CHECK-NEXT:    ret i16 [[SHR]]
+;
+  %y = call i16 @llvm.umax.i16(i16 %x, i16 2)
+  %shr = ashr i16 %y, 1
+  ret i16 %shr
+}
+
+declare i16 @llvm.umax.i16(i16, i16)

From 605fadf0ca21ceba3331c1a39230fc7a45ea3134 Mon Sep 17 00:00:00 2001
From: Dmitry Vyukov
Date: Thu, 26 Oct 2023 11:06:15 +0200
Subject: [PATCH 032/877] [libc] Add --sweep-min-size flag for benchmarks
 (#70302)

We have --sweep-max-size; it's reasonable to have --sweep-min-size as
well. It can be used when working on the logic for larger sizes, or to
collect a profile for larger sizes only.

---
 libc/benchmarks/LibcMemoryBenchmarkMain.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
index acd7c30717597..bc6fd8b38cb6d 100644
--- a/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
+++ b/libc/benchmarks/LibcMemoryBenchmarkMain.cpp
@@ -42,9 +42,15 @@ static cl::opt<std::string>
     SizeDistributionName("size-distribution-name",
                          cl::desc("The name of the distribution to use"));
 
-static cl::opt<bool>
-    SweepMode("sweep-mode",
-              cl::desc("If set, benchmark all sizes from 0 to sweep-max-size"));
+static cl::opt<bool> SweepMode(
+    "sweep-mode",
+    cl::desc(
+        "If set, benchmark all sizes from sweep-min-size to sweep-max-size"));
+
+static cl::opt<uint32_t>
+    SweepMinSize("sweep-min-size",
+                 cl::desc("The minimum size to use in sweep-mode"),
+                 cl::init(0));
 
 static cl::opt<uint32_t>
     SweepMaxSize("sweep-max-size",
@@ -185,7 +191,7 @@ struct MemfunctionBenchmarkSweep final : public MemfunctionBenchmarkBase {
     BO.InitialIterations = 100;
     auto &Measurements = Study.Measurements;
     Measurements.reserve(NumTrials * SweepMaxSize);
-    for (size_t Size = 0; Size <= SweepMaxSize; ++Size) {
+    for (size_t Size = SweepMinSize; Size <= SweepMaxSize; ++Size) {
       CurrentSweepSize = Size;
       runTrials(BO, Measurements);
     }

From 274ce8895b29e8be047a68629efda2e9d3ad1f01 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Thu, 26 Oct 2023 10:07:23 +0100
Subject: [PATCH 033/877] [mlir] Remove `printCString()` from RunnerUtils
 (#70197)

This is now unused and can be replaced with `printString()` from
CRunnerUtils or `vector.print str`.
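For code migrating off the removed entry point, a minimal C++ sketch of the
replacement path follows. The `printString` signature is assumed from
CRunnerUtils of the same era (this patch only confirms the removal of
`printCString`), and the local definition mirroring the deleted `fputs`
wrapper is a stand-in so the sketch runs without linking the runner utils
library.

    #include <cstdio>

    // Stand-in for the CRunnerUtils export `printString` (assumed signature).
    // The removed printCString was a thin wrapper over fputs, and the
    // replacement behaves the same way for a plain C string.
    extern "C" void printString(char const *str) { std::fputs(str, stdout); }

    int main() {
      printString("String to print\n"); // the output the deleted test checked
      return 0;
    }

When linking against the real runner utils library, the local definition
would be dropped in favor of the exported symbol, or the MLIR-level
`vector.print str` form can be used instead.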
---
 .../mlir/ExecutionEngine/RunnerUtils.h        |  1 -
 mlir/lib/ExecutionEngine/RunnerUtils.cpp      |  3 ---
 mlir/test/mlir-cpu-runner/print.mlir          | 19 -------------------
 3 files changed, 23 deletions(-)
 delete mode 100644 mlir/test/mlir-cpu-runner/print.mlir

diff --git a/mlir/include/mlir/ExecutionEngine/RunnerUtils.h b/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
index 56c0983e5f15b..f998ed53b3403 100644
--- a/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
+++ b/mlir/include/mlir/ExecutionEngine/RunnerUtils.h
@@ -392,7 +392,6 @@ extern "C" MLIR_RUNNERUTILS_EXPORT void printMemrefF64(int64_t rank, void *ptr);
 extern "C" MLIR_RUNNERUTILS_EXPORT void printMemrefInd(int64_t rank, void *ptr);
 extern "C" MLIR_RUNNERUTILS_EXPORT void printMemrefC32(int64_t rank, void *ptr);
 extern "C" MLIR_RUNNERUTILS_EXPORT void printMemrefC64(int64_t rank, void *ptr);
-extern "C" MLIR_RUNNERUTILS_EXPORT void printCString(char *str);
 
 extern "C" MLIR_RUNNERUTILS_EXPORT void
 _mlir_ciface_printMemref0dF32(StridedMemRefType<float, 0> *m);
diff --git a/mlir/lib/ExecutionEngine/RunnerUtils.cpp b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
index 4618866f68a44..bbfd3a6b11c2a 100644
--- a/mlir/lib/ExecutionEngine/RunnerUtils.cpp
+++ b/mlir/lib/ExecutionEngine/RunnerUtils.cpp
@@ -158,9 +158,6 @@ extern "C" void printMemrefC64(int64_t rank, void *ptr) {
   _mlir_ciface_printMemrefC64(&descriptor);
 }
 
-/// Deprecated. This should be unified with printString from CRunnerUtils.
-extern "C" void printCString(char *str) { fputs(str, stdout); }
-
 extern "C" void _mlir_ciface_printMemref0dF32(StridedMemRefType<float, 0> *M) {
   impl::printMemRef(*M);
 }
diff --git a/mlir/test/mlir-cpu-runner/print.mlir b/mlir/test/mlir-cpu-runner/print.mlir
deleted file mode 100644
index e36c7154f03a7..0000000000000
--- a/mlir/test/mlir-cpu-runner/print.mlir
+++ /dev/null
@@ -1,19 +0,0 @@
-// RUN: mlir-opt %s -pass-pipeline="builtin.module(convert-func-to-llvm,reconcile-unrealized-casts)" \
-// RUN:   | mlir-cpu-runner -e main -entry-point-result=void \
-// RUN:     -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils \
-// RUN:   | FileCheck %s
-
-
-llvm.mlir.global internal constant @str_global("String to print\0A")
-llvm.func @printCString(!llvm.ptr<i8>)
-
-func.func @main() {
-  %0 = llvm.mlir.addressof @str_global : !llvm.ptr<array<16 x i8>>
-  %1 = llvm.mlir.constant(0 : index) : i64
-  %2 = llvm.getelementptr %0[%1, %1]
-    : (!llvm.ptr<array<16 x i8>>, i64, i64) -> !llvm.ptr<i8>
-  llvm.call @printCString(%2) : (!llvm.ptr<i8>) -> ()
-  return
-}
-
-// CHECK: String to print

From ba3d6e0499231c5f425b92cc126116b0e184dbf1 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak
Date: Thu, 26 Oct 2023 11:34:33 +0200
Subject: [PATCH 034/877] [AMDGPU] Rematerialize scalar loads (#68778)

Extend the list of instructions that can be rematerialized in
SIInstrInfo::isReallyTriviallyReMaterializable() to support scalar
loads.

Try shrinking instructions to rematerialize only the part needed in the
current context.

Add the SIInstrInfo::reMaterialize target hook, and handle shrinking of
S_LOAD_DWORDX16_IMM to S_LOAD_DWORDX8_IMM as a proof of concept.
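A compressed, hedged sketch of the shrinking rule described above: the bit
width of the single used subregister selects a narrower opcode, and anything
else keeps the wide load. Only the opcode names are taken from the patch; the
rest is illustrative C++, not the in-tree implementation.

    #include <optional>

    enum class Opcode { S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX8_IMM };

    // Map the size (in bits) of the subregister the sole user reads to a
    // narrower scalar-load opcode; std::nullopt means "do not shrink".
    std::optional<Opcode> shrinkTo(unsigned subregSizeBits) {
      switch (subregSizeBits) {
      case 256: return Opcode::S_LOAD_DWORDX8_IMM; // reads 8 dwords
      case 128: return Opcode::S_LOAD_DWORDX4_IMM; // reads 4 dwords
      default:  return std::nullopt;               // keep the wide load
      }
    }

The subregister's byte offset is then folded into the narrow load's immediate
offset, which corresponds to the `Offset / 8` adjustment in the implementation
below.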
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 106 +++++++++++++++- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 5 + .../hsa-metadata-kernel-code-props-v3.ll | 4 +- llvm/test/CodeGen/AMDGPU/remat-smrd.mir | 116 +++++------------- .../AMDGPU/snippet-copy-bundle-regression.mir | 42 +------ 5 files changed, 146 insertions(+), 127 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 827c2c1566384..7046c37ef6efd 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -106,9 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } +static bool canRemat(const MachineInstr &MI) { + + if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) || + SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) || + SIInstrInfo::isSALU(MI)) + return true; + + if (SIInstrInfo::isSMRD(MI)) { + return !MI.memoperands_empty() && + llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) { + return MMO->isLoad() && MMO->isInvariant(); + }); + } + + return false; +} + bool SIInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { + + if (canRemat(MI)) { // Normally VALU use of exec would block the rematerialization, but that // is OK in this case to have an implicit exec read as all VALU do. // We really want all of the generic logic for this except for this. @@ -2434,6 +2452,92 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } +void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, Register DestReg, + unsigned SubIdx, const MachineInstr &Orig, + const TargetRegisterInfo &RI) const { + + // Try shrinking the instruction to remat only the part needed for current + // context. + // TODO: Handle more cases. + unsigned Opcode = Orig.getOpcode(); + switch (Opcode) { + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: { + if (SubIdx != 0) + break; + + if (I == MBB.end()) + break; + + if (I->isBundled()) + break; + + // Look for a single use of the register that is also a subreg. + Register RegToFind = Orig.getOperand(0).getReg(); + MachineOperand *UseMO = nullptr; + for (auto &CandMO : I->operands()) { + if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef()) + continue; + if (UseMO) { + UseMO = nullptr; + break; + } + UseMO = &CandMO; + } + if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister) + break; + + unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg()); + unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg()); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet."); + + unsigned NewOpcode = -1; + if (SubregSize == 256) + NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM; + else if (SubregSize == 128) + NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM; + else + break; + + const MCInstrDesc &TID = get(NewOpcode); + const TargetRegisterClass *NewRC = + RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); + MRI.setRegClass(DestReg, NewRC); + + UseMO->setReg(DestReg); + UseMO->setSubReg(AMDGPU::NoSubRegister); + + // Use a smaller load with the desired size, possibly with updated offset. 
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig); + MI->setDesc(TID); + MI->getOperand(0).setReg(DestReg); + MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister); + if (Offset) { + MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset); + int64_t FinalOffset = OffsetMO->getImm() + Offset / 8; + OffsetMO->setImm(FinalOffset); + } + SmallVector NewMMOs; + for (const MachineMemOperand *MemOp : Orig.memoperands()) + NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(), + SubregSize / 8)); + MI->setMemRefs(*MF, NewMMOs); + + MBB.insert(I, MI); + return; + } + + default: + break; + } + + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); +} + std::pair SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 5ef17c44f7de3..a64cf0244e4c0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -275,6 +275,11 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { bool expandPostRAPseudo(MachineInstr &MI) const override; + void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, + Register DestReg, unsigned SubIdx, + const MachineInstr &Orig, + const TargetRegisterInfo &TRI) const override; + // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp // instructions. Returns a pair of generated instructions. // Can split either post-RA with physical registers or pre-RA with diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll index d6f7a92af9dcb..1999c7b065e68 100644 --- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props-v3.ll @@ -47,8 +47,8 @@ entry: } ; CHECK: .name: num_spilled_sgprs -; GFX700: .sgpr_spill_count: 38 -; GFX803: .sgpr_spill_count: 22 +; GFX700: .sgpr_spill_count: 12 +; GFX803: .sgpr_spill_count: 12 ; GFX900: .sgpr_spill_count: 48 ; GFX1010: .sgpr_spill_count: 48 ; CHECK: .symbol: num_spilled_sgprs.kd diff --git a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir index 753cb135fdae9..95eac12a65389 100644 --- a/llvm/test/CodeGen/AMDGPU/remat-smrd.mir +++ b/llvm/test/CodeGen/AMDGPU/remat-smrd.mir @@ -12,15 +12,11 @@ body: | ; GCN: liveins: $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; 
GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -44,16 +40,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s64), addrspace 4) - ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.1, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.1, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4) - ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.0, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4) - ; GCN-NEXT: SI_SPILL_S64_SAVE killed renamable $sgpr2_sgpr3, %stack.2, implicit $exec, implicit $sp_reg :: (store (s64) into %stack.2, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr2_sgpr3 = SI_SPILL_S64_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s64) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr2_sgpr3 = S_LOAD_DWORDX2_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr2_sgpr3 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -77,16 +67,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), addrspace 4) - ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.1, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.1, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4) - ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.0, implicit $exec, implicit $sp_reg :: (store (s128) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4) - ; GCN-NEXT: SI_SPILL_S128_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, %stack.2, implicit $exec, implicit $sp_reg :: (store 
(s128) into %stack.2, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s128), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s128) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s128), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -110,16 +94,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), addrspace 4) - ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.1, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.1, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4) - ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), addrspace 4) - ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.2, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.2, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s256), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s256), 
addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -143,16 +121,10 @@ body: | ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.1, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.1, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.2, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.2, align 4, addrspace 5) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.1, align 4, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 4, 0 :: (dereferenceable invariant load (s512), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.2, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 8, 0 :: (dereferenceable invariant load (s512), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 
%0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -175,13 +147,12 @@ body: | ; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 32, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) + ; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -206,14 +177,12 @@ body: | ; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = 
SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = KILL killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, implicit renamable $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit renamable $sgpr0_sgpr1 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 @@ -238,10 +207,8 @@ body: | ; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: SI_SPILL_S512_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, %stack.0, implicit $exec, implicit $sp_reg :: (store (s512) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = SI_SPILL_S512_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s512) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s512), align 4, addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -265,13 +232,12 @@ body: | ; GCN: liveins: $sgpr8_sgpr9, $vgpr0_vgpr1 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr0_sgpr1 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable 
$sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) - ; GCN-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sp_reg :: (store (s256) into %stack.0, align 4, addrspace 5) ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 16, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4) + ; GCN-NEXT: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr0_sgpr1, 128, 0 :: (dereferenceable invariant load (s256), align 4, addrspace 4) - ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s256) from %stack.0, align 4, addrspace 5) + ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0 :: (dereferenceable invariant load (s128), align 4, addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr0_sgpr1 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -331,16 +297,10 @@ body: | ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10 ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR renamable $sgpr2_sgpr3, renamable $sgpr0, 0 :: (dereferenceable invariant load (s32), 
addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -366,16 +326,10 @@ body: | ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9 ; GCN-NEXT: renamable $sgpr0 = COPY $sgpr10 ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr1, %stack.2, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.2, addrspace 5) - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr1 = SI_SPILL_S32_RESTORE %stack.2, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.2, addrspace 5) + ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_SGPR_IMM renamable $sgpr2_sgpr3, renamable $sgpr0, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3, implicit killed renamable $sgpr0 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -430,15 +384,11 @@ body: | ; GCN: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11 - ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr1 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 4, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, 
implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_BUFFER_LOAD_DWORD_IMM renamable $sgpr4_sgpr5_sgpr6_sgpr7, 8, 0 :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 %0:sgpr_128 = COPY $sgpr8_sgpr9_sgpr10_sgpr11 @@ -461,15 +411,11 @@ body: | ; GCN: liveins: $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr1 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_SCRATCH_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0, implicit $flat_scr :: (dereferenceable invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 @@ -492,15 +438,11 @@ body: | ; GCN: liveins: $sgpr8_sgpr9 ; GCN-NEXT: {{ $}} ; GCN-NEXT: renamable $sgpr2_sgpr3 = COPY $sgpr8_sgpr9 - ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.0, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: renamable $sgpr1 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 4, 0 :: (invariant load (s32), addrspace 4) - ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (invariant load (s32), addrspace 4) - ; GCN-NEXT: SI_SPILL_S32_SAVE killed renamable $sgpr0, %stack.1, implicit $exec, implicit $sp_reg :: (store (s32) into %stack.1, addrspace 5) - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.0, addrspace 5) + ; GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 0, 0 :: (invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr1 - ; GCN-NEXT: renamable $sgpr0 = SI_SPILL_S32_RESTORE %stack.1, implicit $exec, implicit $sp_reg :: (load (s32) from %stack.1, addrspace 5) + ; 
GCN-NEXT: renamable $sgpr0 = S_LOAD_DWORD_IMM renamable $sgpr2_sgpr3, 8, 0 :: (invariant load (s32), addrspace 4) ; GCN-NEXT: S_NOP 0, implicit killed renamable $sgpr0 ; GCN-NEXT: S_ENDPGM 0, implicit killed renamable $sgpr2_sgpr3 %0:sreg_64_xexec = COPY $sgpr8_sgpr9 diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir index 55639a27fd5c7..355829825146d 100644 --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -33,7 +33,6 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF @@ -41,16 +40,6 @@ body: | ; CHECK-NEXT: renamable $sgpr36_sgpr37 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) - ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr1, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr5, 1, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr6, 2, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr7, 3, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr8, 4, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr9, 5, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr10, 6, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 killed $sgpr11, 7, killed $vgpr1, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 @@ -63,50 +52,30 @@ body: | ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 - ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 - ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 - ; CHECK-NEXT: $sgpr8 = 
V_READLANE_B32 $vgpr1, 4 - ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 - ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 - ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 - ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 - ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 - ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 - ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4 - ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 - ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 - ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc - ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, 
$sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00
  ; CHECK-NEXT: {{  $}}
  ; CHECK-NEXT:   dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4)
  ; CHECK-NEXT:   GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1)
@@ -116,7 +85,6 @@ body: |
  ; CHECK-NEXT:   $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37
  ; CHECK-NEXT:   $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35
  ; CHECK-NEXT:   ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
-  ; CHECK-NEXT:   KILL killed renamable $vgpr1
  ; CHECK-NEXT:   S_ENDPGM 0
  bb.0:
    liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16

From 12dfcc02388810e113cca359b2b529c98d02307c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 26 Oct 2023 10:09:31 +0100
Subject: [PATCH 035/877] [DAG] Update test case for Issue #69965

The previous reduced test case just showed a minor codegen regression;
this test now shows the actual miscompilation.

---
 llvm/test/CodeGen/X86/pr69965.ll | 74 ++++++++++++++++++--------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr69965.ll b/llvm/test/CodeGen/X86/pr69965.ll
index fc805e5097c0b..47bec4d7b1ab5 100644
--- a/llvm/test/CodeGen/X86/pr69965.ll
+++ b/llvm/test/CodeGen/X86/pr69965.ll
@@ -2,40 +2,50 @@
 ; RUN: llc < %s -mtriple=i686-- | FileCheck %s --check-prefix=X86
 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s --check-prefix=X64
 
-define i16 @test(i8 %_in) {
-; X86-LABEL: test:
-; X86:       # %bb.0:
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    notb %al
-; X86-NEXT:    movl %eax, %ecx
-; X86-NEXT:    orb $-128, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    shll $8, %ecx
-; X86-NEXT:    addb %al, %al
-; X86-NEXT:    movzbl %al, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+define i64 @PR69965(ptr %input_ptrs, ptr %output_ptrs) {
+; X86-LABEL: PR69965:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl (%eax), %eax
+; X86-NEXT:    movzbl (%eax), %eax
+; X86-NEXT:    notl %eax
+; X86-NEXT:    movzbl %al, %edx
+; X86-NEXT:    shll $8, %eax
+; X86-NEXT:    movl (%ecx), %ecx
+; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    orl $32768, %eax # imm = 0x8000
+; X86-NEXT:    movw %ax, (%ecx)
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    xorl %edx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test:
-; X64:       # %bb.0:
-; X64-NEXT:    notb %dil
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orb $-128, %al
+; X64-LABEL: PR69965:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movzbl (%rax), %eax
+; X64-NEXT:    notl %eax
 ; X64-NEXT:    movzbl %al, %ecx
-; X64-NEXT:    shll $8, %ecx
-; X64-NEXT:    addb %dil, %dil
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    orl %ecx, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    # kill: def $eax killed $eax def $rax
+; X64-NEXT:    shll $8, %eax
+; X64-NEXT:    movq (%rsi), %rdx
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    orl $32768, %eax # imm = 0x8000
+; X64-NEXT:    movw %ax, (%rdx)
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
-  %_1 = and i8 %_in, 127
-  %_2 = xor i8 %_1, 127
-  %_3 = or i8 %_2, -128
-  %_4 = zext i8 %_3 to i16
-  %_6 = shl nuw i16 %_4, 8
-  %_7 = shl nuw i8 %_2, 1
-  %_8 = zext i8 %_7 to i16
-  %_9 = or i16 %_6, %_8
-  ret i16 %_9
+entry:
+  %0 = 
load ptr, ptr %input_ptrs, align 8 + %.val.i = load i8, ptr %0, align 1 + %1 = and i8 %.val.i, 127 + %2 = xor i8 %1, 127 + %3 = or i8 %2, -128 + %4 = zext i8 %3 to i16 + %5 = load ptr, ptr %output_ptrs, align 8 + %6 = shl nuw i16 %4, 8 + %7 = shl nuw i8 %2, 1 + %8 = zext i8 %7 to i16 + %9 = or i16 %6, %8 + store i16 %9, ptr %5, align 2 + ret i64 0 } From 547dc461225ba65b6d5dac34efd31d2c8c5b1049 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 26 Oct 2023 10:34:06 +0100 Subject: [PATCH 036/877] [DAG] SimplifyDemandedBits - ensure we drop NSW/NUW flags when we simplify a SHL node's input We already do this for variable shifts, but we missed it for constant shifts Fixes #69965 --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 11 ++++++++++- llvm/test/CodeGen/X86/pr69965.ll | 19 +++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 8b4f315949912..c139850eb9963 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -1786,8 +1786,17 @@ bool TargetLowering::SimplifyDemandedBits( APInt InDemandedMask = DemandedBits.lshr(ShAmt); if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO, - Depth + 1)) + Depth + 1)) { + SDNodeFlags Flags = Op.getNode()->getFlags(); + if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) { + // Disable the nsw and nuw flags. We can no longer guarantee that we + // won't wrap after simplification. + Flags.setNoSignedWrap(false); + Flags.setNoUnsignedWrap(false); + Op->setFlags(Flags); + } return true; + } assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShAmt; Known.One <<= ShAmt; diff --git a/llvm/test/CodeGen/X86/pr69965.ll b/llvm/test/CodeGen/X86/pr69965.ll index 47bec4d7b1ab5..33bea976c7896 100644 --- a/llvm/test/CodeGen/X86/pr69965.ll +++ b/llvm/test/CodeGen/X86/pr69965.ll @@ -10,10 +10,12 @@ define i64 @PR69965(ptr %input_ptrs, ptr %output_ptrs) { ; X86-NEXT: movl (%eax), %eax ; X86-NEXT: movzbl (%eax), %eax ; X86-NEXT: notl %eax -; X86-NEXT: movzbl %al, %edx -; X86-NEXT: shll $8, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: shll $8, %edx ; X86-NEXT: movl (%ecx), %ecx -; X86-NEXT: leal (%eax,%edx,2), %eax +; X86-NEXT: addb %al, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: orl %edx, %eax ; X86-NEXT: orl $32768, %eax # imm = 0x8000 ; X86-NEXT: movw %ax, (%ecx) ; X86-NEXT: xorl %eax, %eax @@ -25,13 +27,14 @@ define i64 @PR69965(ptr %input_ptrs, ptr %output_ptrs) { ; X64-NEXT: movq (%rdi), %rax ; X64-NEXT: movzbl (%rax), %eax ; X64-NEXT: notl %eax -; X64-NEXT: movzbl %al, %ecx -; X64-NEXT: # kill: def $eax killed $eax def $rax +; X64-NEXT: leal (%rax,%rax), %ecx +; X64-NEXT: # kill: def $eax killed $eax killed $rax ; X64-NEXT: shll $8, %eax ; X64-NEXT: movq (%rsi), %rdx -; X64-NEXT: leal (%rax,%rcx,2), %eax -; X64-NEXT: orl $32768, %eax # imm = 0x8000 -; X64-NEXT: movw %ax, (%rdx) +; X64-NEXT: movzbl %cl, %ecx +; X64-NEXT: orl %eax, %ecx +; X64-NEXT: orl $32768, %ecx # imm = 0x8000 +; X64-NEXT: movw %cx, (%rdx) ; X64-NEXT: xorl %eax, %eax ; X64-NEXT: retq entry: From 1217a54be9a194137c776c692a2fad2ba80a8ee8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 26 Oct 2023 11:23:14 +0200 Subject: [PATCH 037/877] [clang][Interp][NFC] Make InlineDescriptor::Desc const --- clang/lib/AST/Interp/Descriptor.cpp | 2 +- clang/lib/AST/Interp/Descriptor.h | 6 +++--- clang/lib/AST/Interp/Pointer.cpp | 2 +- 
clang/lib/AST/Interp/Pointer.h | 7 ++++--- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index abd75a8d67bbd..2a21f60588d46 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -262,7 +262,7 @@ Descriptor::Descriptor(const DeclTy &D, PrimType Type, bool IsTemporary, } /// Arrays of composite elements. -Descriptor::Descriptor(const DeclTy &D, Descriptor *Elem, MetadataSize MD, +Descriptor::Descriptor(const DeclTy &D, const Descriptor *Elem, MetadataSize MD, unsigned NumElems, bool IsConst, bool IsTemporary, bool IsMutable) : Source(D), ElemSize(Elem->getAllocSize() + sizeof(InlineDescriptor)), diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index be9a380138a7b..2fd4e92082645 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -72,7 +72,7 @@ struct InlineDescriptor { /// Flag indicating if the field is mutable (if in a record). unsigned IsFieldMutable : 1; - Descriptor *Desc; + const Descriptor *Desc; }; /// Describes a memory block created by an allocation site. @@ -102,7 +102,7 @@ struct Descriptor final { /// Pointer to the record, if block contains records. Record *const ElemRecord = nullptr; /// Descriptor of the array element. - Descriptor *const ElemDesc = nullptr; + const Descriptor *const ElemDesc = nullptr; /// Flag indicating if the block is mutable. const bool IsConst = false; /// Flag indicating if a field is mutable. @@ -129,7 +129,7 @@ struct Descriptor final { Descriptor(const DeclTy &D, PrimType Type, bool IsTemporary, UnknownSize); /// Allocates a descriptor for an array of composites. - Descriptor(const DeclTy &D, Descriptor *Elem, MetadataSize MD, + Descriptor(const DeclTy &D, const Descriptor *Elem, MetadataSize MD, unsigned NumElems, bool IsConst, bool IsTemporary, bool IsMutable); /// Allocates a descriptor for an array of composites of unknown size. diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp index d1af58203bec6..e979b99b0fdd0 100644 --- a/clang/lib/AST/Interp/Pointer.cpp +++ b/clang/lib/AST/Interp/Pointer.cpp @@ -94,7 +94,7 @@ APValue Pointer::toAPValue() const { Offset = CharUnits::Zero(); } else { // Build the lvalue base from the block. - Descriptor *Desc = getDeclDesc(); + const Descriptor *Desc = getDeclDesc(); if (auto *VD = Desc->asValueDecl()) Base = VD; else if (auto *E = Desc->asExpr()) diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 3b21290332a9d..65d710077fd1c 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -184,7 +184,8 @@ class Pointer { // Step into the containing array, if inside one. unsigned Next = Base - getInlineDesc()->Offset; - Descriptor *Desc = Next == 0 ? getDeclDesc() : getDescriptor(Next)->Desc; + const Descriptor *Desc = + Next == 0 ? getDeclDesc() : getDescriptor(Next)->Desc; if (!Desc->IsArray) return *this; return Pointer(Pointee, Next, Offset); @@ -198,7 +199,7 @@ class Pointer { bool isField() const { return Base != 0 && Base != RootPtrMark; } /// Accessor for information about the declaration site. - Descriptor *getDeclDesc() const { return Pointee->Desc; } + const Descriptor *getDeclDesc() const { return Pointee->Desc; } SourceLocation getDeclLoc() const { return getDeclDesc()->getLocation(); } /// Returns a pointer to the object of which this pointer is a field. 
@@ -222,7 +223,7 @@ class Pointer { } /// Accessors for information about the innermost field. - Descriptor *getFieldDesc() const { + const Descriptor *getFieldDesc() const { if (Base == 0 || Base == RootPtrMark) return getDeclDesc(); return getInlineDesc()->Desc; From f8b7506e2db2a162deec8343d6942258484be233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 26 Oct 2023 11:40:43 +0200 Subject: [PATCH 038/877] [clang][NFC] Move a variable into the closest scope AllocType is not used anywhere else. --- clang/lib/AST/ExprConstant.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index eea0827d6f7a8..320e2ef12c38d 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -6867,8 +6867,8 @@ static std::optional CheckDeleteKind(EvalInfo &Info, const Expr *E, return std::nullopt; } - QualType AllocType = Pointer.Base.getDynamicAllocType(); if (DeallocKind != (*Alloc)->getKind()) { + QualType AllocType = Pointer.Base.getDynamicAllocType(); Info.FFDiag(E, diag::note_constexpr_new_delete_mismatch) << DeallocKind << (*Alloc)->getKind() << AllocType; NoteLValueLocation(Info, Pointer.Base); From 737d2acd6863ca7012a75575c1a52fd807ed44ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 26 Oct 2023 11:47:26 +0200 Subject: [PATCH 039/877] [clang][Interp][NFC] Make Block::Desc const --- clang/lib/AST/Interp/InterpBlock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 89f937b588fb6..4ab67ebb9eaaf 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -53,7 +53,7 @@ class Block final { bool IsStatic = false, bool IsExtern = false) : DeclID(DeclID), IsStatic(IsStatic), IsExtern(IsExtern), Desc(Desc) {} - Block(Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) + Block(const Descriptor *Desc, bool IsStatic = false, bool IsExtern = false) : DeclID((unsigned)-1), IsStatic(IsStatic), IsExtern(IsExtern), Desc(Desc) {} @@ -120,8 +120,8 @@ class Block final { friend class DeadBlock; friend class InterpState; - Block(Descriptor *Desc, bool IsExtern, bool IsStatic, bool IsDead) - : IsStatic(IsStatic), IsExtern(IsExtern), IsDead(true), Desc(Desc) {} + Block(const Descriptor *Desc, bool IsExtern, bool IsStatic, bool IsDead) + : IsStatic(IsStatic), IsExtern(IsExtern), IsDead(true), Desc(Desc) {} /// Deletes a dead block at the end of its lifetime. void cleanup(); @@ -149,7 +149,7 @@ class Block final { /// via invokeCtor. bool IsInitialized = false; /// Pointer to the stack slot descriptor. - Descriptor *Desc; + const Descriptor *Desc; }; /// Descriptor for a dead block. 
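Both Interp const-correctness patches above ([PATCH 037/877] and [PATCH 039/877]) apply the same pattern: a Descriptor is treated as immutable once created, so the Desc pointers held by InlineDescriptor, Pointer, and Block can be const-qualified, and the qualifier simply propagates through the accessors. A minimal standalone sketch of why such a change is NFC follows; the names mirror the patches, but the types are simplified illustrative stand-ins, not the actual clang/AST/Interp sources:

```
// Minimal sketch of the const-propagation pattern from PATCH 037/039.
// Assumption (matching the patches): a Descriptor is never mutated after
// construction, which is what makes adding the const qualifier safe.
#include <cstddef>

struct Descriptor {
  const size_t AllocSize; // immutable after construction
  explicit Descriptor(size_t Size) : AllocSize(Size) {}
};

class Block {
public:
  // Taking 'const Descriptor *' documents that Block only reads the
  // descriptor; it never writes through the pointer.
  explicit Block(const Descriptor *Desc) : Desc(Desc) {}

  // The const qualifier propagates to callers through the accessors.
  const Descriptor *getDescriptor() const { return Desc; }
  size_t getSize() const { return Desc->AllocSize; }

private:
  const Descriptor *Desc; // pointer to shared, immutable metadata
};

int main() {
  Descriptor D(16);
  Block B(&D);
  return B.getSize() == 16 ? 0 : 1; // exits 0 when the accessor works
}
```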
From 24865a6423bbdb0286a649c5a953af0e817bfee8 Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 26 Oct 2023 11:46:14 +0200 Subject: [PATCH 040/877] [Inline Spiller] Pre-commit test --- llvm/test/CodeGen/AMDGPU/dead_bundle.mir | 42 ++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/dead_bundle.mir diff --git a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir new file mode 100644 index 0000000000000..d302cb8051c1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir @@ -0,0 +1,42 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs=0 -start-before=greedy,0 -stop-after=virtregrewriter,0 -stress-regalloc=5 %s -o - | FileCheck %s + +# This test currently fails with verify-machineinstrs=1 due to dead bundle mishandling: "Live range continues after dead def flag". +--- +name: psmain +tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' + psInputAddr: 7 + psInputEnable: 7 +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + + ; CHECK-LABEL: name: psmain + ; CHECK: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: dead renamable $sgpr3 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr1 = KILL undef $sgpr1 + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4) + ; CHECK-NEXT: dead [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, $sgpr4, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: SI_SPILL_S256_SAVE renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: dead renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4) + ; CHECK-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr7 + ; CHECK-NEXT: renamable $sgpr5 = COPY renamable $sgpr9 + ; CHECK-NEXT: dead undef %4.sub0:vreg_64 = COPY renamable $sgpr3 + ; CHECK-NEXT: dead undef %7.sub1:vreg_64 = COPY killed renamable $sgpr5 + ; CHECK-NEXT: dead [[IMAGE_SAMPLE_V1_V2_gfx11_:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef %4, undef renamable $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + ; CHECK-NEXT: S_ENDPGM 0 + undef %8.sub3:sgpr_128 = IMPLICIT_DEF + undef %8.sub1:sgpr_128 = COPY undef $sgpr1 + %346:sgpr_256 = S_BUFFER_LOAD_DWORDX8_IMM undef %8, 416, 0 :: (dereferenceable invariant load (s256), align 4) + %60:vgpr_32 = V_CVT_U32_F32_e64 0, %346.sub0, 0, 0, implicit $mode, implicit $exec + %127:sgpr_512 = IMPLICIT_DEF + undef %283.sub0:vreg_64 = COPY %346.sub3 + undef %283.sub1:vreg_64 = COPY %346.sub5 + %282:vgpr_32 = IMAGE_SAMPLE_V1_V2_gfx11 undef %283, undef %127.sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15, %8, 1, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8) + S_ENDPGM 0 +... 
+

From 80abbeca8e52174157654ee33151da507d098ced Mon Sep 17 00:00:00 2001
From: Piotr Sobczak <piotr.sobczak@amd.com>
Date: Thu, 26 Oct 2023 11:51:26 +0200
Subject: [PATCH 041/877] [Inline Spiller] Consider bundles when marking defs
 as dead

Fix a bug where the code expects just a single MI, but a series of
bundled MIs needs to be handled instead. The semi-formed bundles are
created by SplitKit for the case where not all lanes are live
(buildSingleSubRegCopy).

Then the remat kicks in, and since the values that are copied in the
bundle do not need to be preserved due to the remat (dead defs), all
instructions in the bundle should be marked as dead. However, only the
first one gets marked as dead, which causes the verifier to complain
later with the error: "Live range continues after dead def flag".

Differential Revision: https://reviews.llvm.org/D156999
---
 llvm/lib/CodeGen/InlineSpiller.cpp       | 29 ++++++++++++++++++++++++
 llvm/test/CodeGen/AMDGPU/dead_bundle.mir |  5 ++--
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 46fcc62e09e8a..71d58b2e9e18d 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -753,6 +753,35 @@ void InlineSpiller::reMaterializeAll() {
       continue;
     LLVM_DEBUG(dbgs() << "All defs dead: " << *MI);
     DeadDefs.push_back(MI);
+    // If MI is a bundle header, also try removing copies inside the bundle,
+    // otherwise the verifier would complain "live range continues after dead
+    // def flag".
+    if (MI->isBundledWithSucc() && !MI->isBundledWithPred()) {
+      MachineBasicBlock::instr_iterator BeginIt = MI->getIterator(),
+                                        EndIt = MI->getParent()->instr_end();
+      ++BeginIt; // Skip MI that was already handled.
+
+      bool OnlyDeadCopies = true;
+      for (MachineBasicBlock::instr_iterator It = BeginIt;
+           It != EndIt && It->isBundledWithPred(); ++It) {
+
+        auto DestSrc = TII.isCopyInstr(*It);
+        bool IsCopyToDeadReg =
+            DestSrc && DestSrc->Destination->getReg() == Reg;
+        if (!IsCopyToDeadReg) {
+          OnlyDeadCopies = false;
+          break;
+        }
+      }
+      if (OnlyDeadCopies) {
+        for (MachineBasicBlock::instr_iterator It = BeginIt;
+             It != EndIt && It->isBundledWithPred(); ++It) {
+          It->addRegisterDead(Reg, &TRI);
+          LLVM_DEBUG(dbgs() << "All defs dead: " << *It);
+          DeadDefs.push_back(&*It);
+        }
+      }
+    }
   }
 }
diff --git a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
index d302cb8051c1a..7970650f43d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
+++ b/llvm/test/CodeGen/AMDGPU/dead_bundle.mir
@@ -1,7 +1,7 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs=0 -start-before=greedy,0 -stop-after=virtregrewriter,0 -stress-regalloc=5 %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs=1 -start-before=greedy,0 -stop-after=virtregrewriter,0 -stress-regalloc=5 %s -o - | FileCheck %s
 
-# This test currently fails with verify-machineinstrs=1 due to dead bundle mishandling: "Live range continues after dead def flag".
+# This test checks that dead bundles are handled correctly. 
--- name: psmain tracksRegLiveness: true @@ -20,7 +20,6 @@ body: | ; CHECK-NEXT: renamable $sgpr1 = KILL undef $sgpr1 ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4) ; CHECK-NEXT: dead [[V_CVT_U32_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_U32_F32_e64 0, $sgpr4, 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_S256_SAVE renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_BUFFER_LOAD_DWORDX8_IMM undef renamable $sgpr0_sgpr1_sgpr2_sgpr3, 416, 0 :: (dereferenceable invariant load (s256), align 4) ; CHECK-NEXT: renamable $sgpr3 = COPY killed renamable $sgpr7 From 85f6b2fac9a367337e43ca288c45ea783981cc16 Mon Sep 17 00:00:00 2001 From: Sunil Kuravinakop Date: Thu, 26 Oct 2023 05:01:02 -0500 Subject: [PATCH 042/877] [OpenMP] Patch for Support to loop bind clause : Checking Parent Region Differential revision: https://reviews.llvm.org/D158266 --- clang/include/clang/Sema/Sema.h | 8 +- clang/lib/Sema/SemaOpenMP.cpp | 64 +++++--- clang/test/OpenMP/loop_bind_messages.cpp | 180 +++++++++++++++++++++-- clang/test/PCH/pragma-loop.cpp | 8 +- 4 files changed, 227 insertions(+), 33 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 1e9752345ffd1..18ac85011aa75 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11307,6 +11307,7 @@ class Sema final { /// on the parameter of the bind clause. In the methods for the /// mapped directives, check the parameters of the lastprivate clause. bool checkLastPrivateForMappedDirectives(ArrayRef Clauses); + /// Depending on the bind clause of OMPD_loop map the directive to new /// directives. /// 1) loop bind(parallel) --> OMPD_for @@ -11316,9 +11317,12 @@ class Sema final { /// rigorous semantic checking in the new mapped directives. bool mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, ArrayRef Clauses, - OpenMPBindClauseKind BindKind, + OpenMPBindClauseKind &BindKind, OpenMPDirectiveKind &Kind, - OpenMPDirectiveKind &PrevMappedDirective); + OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, + OpenMPDirectiveKind CancelRegion); public: /// The declarator \p D defines a function in the scope \p S which is nested diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 75f9e152dca92..f28e0f2693080 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5062,6 +5062,18 @@ static bool checkNestingOfRegions(Sema &SemaRef, const DSAStackTy *Stack, CurrentRegion != OMPD_cancellation_point && CurrentRegion != OMPD_cancel && CurrentRegion != OMPD_scan) return false; + // Checks needed for mapping "loop" construct. Please check mapLoopConstruct + // for a detailed explanation + if (SemaRef.LangOpts.OpenMP >= 50 && CurrentRegion == OMPD_loop && + ((BindKind == OMPC_BIND_parallel) || (BindKind == OMPC_BIND_teams)) && + (isOpenMPWorksharingDirective(ParentRegion) || + ParentRegion == OMPD_loop)) { + int ErrorMsgNumber = (BindKind == OMPC_BIND_parallel) ? 
1 : 4; + SemaRef.Diag(StartLoc, diag::err_omp_prohibited_region) + << true << getOpenMPDirectiveName(ParentRegion) << ErrorMsgNumber + << getOpenMPDirectiveName(CurrentRegion); + return true; + } if (CurrentRegion == OMPD_cancellation_point || CurrentRegion == OMPD_cancel) { // OpenMP [2.16, Nesting of Regions] @@ -6114,35 +6126,40 @@ processImplicitMapsWithDefaultMappers(Sema &S, DSAStackTy *Stack, bool Sema::mapLoopConstruct(llvm::SmallVector &ClausesWithoutBind, ArrayRef Clauses, - OpenMPBindClauseKind BindKind, + OpenMPBindClauseKind &BindKind, OpenMPDirectiveKind &Kind, - OpenMPDirectiveKind &PrevMappedDirective) { + OpenMPDirectiveKind &PrevMappedDirective, + SourceLocation StartLoc, SourceLocation EndLoc, + const DeclarationNameInfo &DirName, + OpenMPDirectiveKind CancelRegion) { bool UseClausesWithoutBind = false; // Restricting to "#pragma omp loop bind" if (getLangOpts().OpenMP >= 50 && Kind == OMPD_loop) { + + const OpenMPDirectiveKind ParentDirective = DSAStack->getParentDirective(); + if (BindKind == OMPC_BIND_unknown) { // Setting the enclosing teams or parallel construct for the loop // directive without bind clause. BindKind = OMPC_BIND_thread; // Default bind(thread) if binding is unknown - const OpenMPDirectiveKind ParentDirective = - DSAStack->getParentDirective(); if (ParentDirective == OMPD_unknown) { Diag(DSAStack->getDefaultDSALocation(), diag::err_omp_bind_required_on_loop); - } else if (ParentDirective == OMPD_parallel || - ParentDirective == OMPD_target_parallel) { + } else if (isOpenMPParallelDirective(ParentDirective) && + !isOpenMPTeamsDirective(ParentDirective)) { BindKind = OMPC_BIND_parallel; - } else if (ParentDirective == OMPD_teams || - ParentDirective == OMPD_target_teams) { + } else if (isOpenMPNestingTeamsDirective(ParentDirective) || + (ParentDirective == OMPD_target_teams)) { BindKind = OMPC_BIND_teams; } } else { - // bind clause is present, so we should set flag indicating to only - // use the clauses that aren't the bind clause for the new directive that - // loop is lowered to. + // bind clause is present in loop directive. When the loop directive is + // changed to a new directive the bind clause is not used. So, we should + // set flag indicating to only use the clauses that aren't the + // bind clause. UseClausesWithoutBind = true; } @@ -6203,26 +6220,35 @@ StmtResult Sema::ActOnOpenMPExecutableDirective( OpenMPDirectiveKind PrevMappedDirective) { StmtResult Res = StmtError(); OpenMPBindClauseKind BindKind = OMPC_BIND_unknown; + llvm::SmallVector ClausesWithoutBind; + bool UseClausesWithoutBind = false; + if (const OMPBindClause *BC = OMPExecutableDirective::getSingleClause(Clauses)) BindKind = BC->getBindKind(); + + // Variable used to note down the DirectiveKind because mapLoopConstruct may + // change "Kind" variable, due to mapping of "omp loop" to other directives. + OpenMPDirectiveKind DK = Kind; + if ((Kind == OMPD_loop) || (PrevMappedDirective == OMPD_loop)) { + UseClausesWithoutBind = mapLoopConstruct( + ClausesWithoutBind, Clauses, BindKind, Kind, PrevMappedDirective, + StartLoc, EndLoc, DirName, CancelRegion); + DK = OMPD_loop; + } + // First check CancelRegion which is then used in checkNestingOfRegions. 
if (checkCancelRegion(*this, Kind, CancelRegion, StartLoc) || - checkNestingOfRegions(*this, DSAStack, Kind, DirName, CancelRegion, - BindKind, StartLoc)) + checkNestingOfRegions(*this, DSAStack, DK, DirName, CancelRegion, + BindKind, StartLoc)) { return StmtError(); + } // Report affected OpenMP target offloading behavior when in HIP lang-mode. if (getLangOpts().HIP && (isOpenMPTargetExecutionDirective(Kind) || isOpenMPTargetDataManagementDirective(Kind))) Diag(StartLoc, diag::warn_hip_omp_target_directives); - llvm::SmallVector ClausesWithoutBind; - bool UseClausesWithoutBind = false; - - UseClausesWithoutBind = mapLoopConstruct(ClausesWithoutBind, Clauses, - BindKind, Kind, PrevMappedDirective); - llvm::SmallVector ClausesWithImplicit; VarsWithInheritedDSAType VarsWithInheritedDSA; bool ErrorFound = false; diff --git a/clang/test/OpenMP/loop_bind_messages.cpp b/clang/test/OpenMP/loop_bind_messages.cpp index f7fdf28971432..186a990fd7c52 100644 --- a/clang/test/OpenMP/loop_bind_messages.cpp +++ b/clang/test/OpenMP/loop_bind_messages.cpp @@ -4,6 +4,7 @@ #define NNN 50 int aaa[NNN]; +int aaa2[NNN][NNN]; void parallel_loop() { #pragma omp parallel @@ -15,6 +16,91 @@ void parallel_loop() { } } +void parallel_for_AND_loop_bind() { + #pragma omp parallel for + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } +} + +void parallel_nowait() { + #pragma omp parallel + #pragma omp for nowait + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'for' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } +} + +void parallel_for_with_nothing() { + #pragma omp parallel for + for (int i = 0 ; i < NNN ; i++) { + #pragma omp nothing + #pragma omp loop // expected-error{{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } +} + +void parallel_targetfor_with_loop_bind() { + #pragma omp target teams distribute parallel for + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'target teams distribute parallel for' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } +} + +void parallel_targetparallel_with_loop() { + #pragma omp target parallel + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } +} + +void loop_bind_AND_loop_bind() { + #pragma omp parallel for + for (int i = 0; i < 100; ++i) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'loop' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa[j] = j*NNN; + } + } + } +} + +void parallel_with_sections_loop() { + #pragma 
omp parallel + { + #pragma omp sections + { + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(parallel) // expected-error{{region cannot be closely nested inside 'sections' region; perhaps you forget to enclose 'omp loop' directive into a parallel region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } + } + + #pragma omp section + { + aaa[NNN-1] = NNN; + } + } + } +} + void teams_loop() { int var1, var2; @@ -34,17 +120,23 @@ void teams_loop() { } } -void orphan_loop_with_bind() { - #pragma omp loop bind(parallel) - for (int j = 0 ; j < NNN ; j++) { - aaa[j] = j*NNN; +void teams_targetteams_with_loop() { + #pragma omp target teams + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(teams) + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } } } -void orphan_loop_no_bind() { - #pragma omp loop // expected-error{{expected 'bind' clause for 'loop' construct without an enclosing OpenMP construct}} - for (int j = 0 ; j < NNN ; j++) { - aaa[j] = j*NNN; +void teams_targetfor_with_loop_bind() { + #pragma omp target teams distribute parallel for + for (int i = 0 ; i < NNN ; i++) { + #pragma omp loop bind(teams) // expected-error{{region cannot be closely nested inside 'target teams distribute parallel for' region; perhaps you forget to enclose 'omp loop' directive into a teams region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa2[i][j] = i+j; + } } } @@ -65,12 +157,80 @@ void teams_loop_reduction() { } } +void teams_loop_distribute() { + int total = 0; + + #pragma omp teams num_teams(8) thread_limit(256) + #pragma omp distribute parallel for dist_schedule(static, 1024) \ + schedule(static, 64) + for (int i = 0; i < NNN; i++) { + #pragma omp loop bind(teams) // expected-error{{'distribute parallel for' region; perhaps you forget to enclose 'omp loop' directive into a teams region?}} + for (int j = 0; j < NNN; j++) { + aaa2[i][j] = i+j; + } + } +} + +void parallel_for_with_loop_teams_bind(){ + #pragma omp parallel for + for (int i = 0; i < NNN; i++) { + #pragma omp loop bind(teams) // expected-error{{region cannot be closely nested inside 'parallel for' region; perhaps you forget to enclose 'omp loop' directive into a teams region?}} + for (int j = 0 ; j < NNN ; j++) { + aaa[i] = i+i*NNN; + } + } +} + +void teams_with_loop_thread_bind(){ + #pragma omp teams + for (int i = 0; i < NNN; i++) { + #pragma omp loop bind(thread) + for (int j = 0 ; j < NNN ; j++) { + aaa[i] = i+i*NNN; + } + } +} + +void orphan_loop_no_bind() { + #pragma omp loop // expected-error{{expected 'bind' clause for 'loop' construct without an enclosing OpenMP construct}} + for (int j = 0 ; j < NNN ; j++) { + aaa[j] = j*NNN; + } +} + +void orphan_loop_parallel_bind() { + #pragma omp loop bind(parallel) + for (int j = 0 ; j < NNN ; j++) { + aaa[j] = j*NNN; + } +} + +void orphan_loop_teams_bind(){ + #pragma omp loop bind(teams) + for (int i = 0; i < NNN; i++) { + aaa[i] = i+i*NNN; + } +} + int main(int argc, char *argv[]) { parallel_loop(); + parallel_for_AND_loop_bind(); + parallel_nowait(); + parallel_for_with_nothing(); + parallel_targetfor_with_loop_bind(); + parallel_targetparallel_with_loop(); + loop_bind_AND_loop_bind(); + parallel_with_sections_loop(); teams_loop(); - orphan_loop_with_bind(); - orphan_loop_no_bind(); + teams_targetteams_with_loop(); + teams_targetfor_with_loop_bind(); teams_loop_reduction(); + teams_loop_distribute(); + parallel_for_with_loop_teams_bind(); + teams_with_loop_thread_bind(); + orphan_loop_no_bind(); + orphan_loop_parallel_bind(); + orphan_loop_teams_bind(); } 
#endif diff --git a/clang/test/PCH/pragma-loop.cpp b/clang/test/PCH/pragma-loop.cpp index f5de630ffc912..a3c6871041c0e 100644 --- a/clang/test/PCH/pragma-loop.cpp +++ b/clang/test/PCH/pragma-loop.cpp @@ -116,9 +116,13 @@ class pragma_test { inline void run10(int *List, int Length) { int i = 0; -#pragma omp loop bind(teams) + int j = 0; + #pragma omp teams for (int i = 0; i < Length; i++) { - List[i] = i; + #pragma omp loop bind(teams) + for (int j = 0; j < Length; j++) { + List[i] = i+j; + } } } From 925f4622dcd09e0d70c6d30779e0c119fb12ce00 Mon Sep 17 00:00:00 2001 From: zhongyunde 00443407 Date: Sat, 21 Oct 2023 13:07:26 +0800 Subject: [PATCH 043/877] [SimplifyCFG] Precommit tests for PR65835 --- .../Transforms/SimplifyCFG/switch_mask.ll | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 llvm/test/Transforms/SimplifyCFG/switch_mask.ll diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll new file mode 100644 index 0000000000000..8c97a0660d070 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=simplifycfg --switch-to-lookup -S < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; https://alive2.llvm.org/ce/z/tuxLhJ +define i1 @switch_lookup_with_small_i1(i64 %x) { +; CHECK-LABEL: @switch_lookup_with_small_i1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND:%.*]] = and i64 [[X:%.*]], 15 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[AND]], 11 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[AND]] to i11 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i11 [[SWITCH_CAST]], 1 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i11 -1018, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i11 [[SWITCH_DOWNSHIFT]] to i1 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false +; CHECK-NEXT: ret i1 [[TMP1]] +; +entry: + %and = and i64 %x, 15 + switch i64 %and, label %default [ + i64 10, label %lor.end + i64 1, label %lor.end + i64 2, label %lor.end + ] + +default: ; preds = %entry + br label %lor.end + +lor.end: ; preds = %entry, %entry, %entry, %default + %0 = phi i1 [ true, %entry ], [ false, %default ], [ true, %entry ], [ true, %entry ] + ret i1 %0 +} + +; https://godbolt.org/z/sjbjorKon +define i8 @switch_lookup_with_small_i8(i64 %x) { +; CHECK-LABEL: @switch_lookup_with_small_i8( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[REM:%.*]] = urem i64 [[X:%.*]], 5 +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[REM]], 3 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i24 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i24 [[SWITCH_CAST]], 8 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i24 460303, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i24 [[SWITCH_DOWNSHIFT]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i8 [[SWITCH_MASKED]], i8 0 +; CHECK-NEXT: ret i8 [[TMP1]] +; +entry: + %rem = urem i64 %x, 5 + switch i64 %rem, label %default [ + i64 0, label %sw.bb0 + i64 1, label %sw.bb1 + i64 2, label %sw.bb2 + ] + +sw.bb0: ; preds = %entry + br label %lor.end + +sw.bb1: ; preds = %entry + br label %lor.end + +sw.bb2: ; preds = %entry + br label %lor.end + +default: ; preds = %entry + br label %lor.end + +lor.end: + %0 = phi i8 [ 15, %sw.bb0 ], [ 6, %sw.bb1 ], [ 7, %sw.bb2 ], [ 0, %default ] + ret i8 %0 +} + +; Negative test: Table size would not fit the register. 
+define i8 @switch_lookup_with_small_i8_negative(i64 %x) {
+; CHECK-LABEL: @switch_lookup_with_small_i8_negative(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REM:%.*]] = urem i64 [[X:%.*]], 9
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ult i64 [[REM]], 3
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i24
+; CHECK-NEXT:    [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i24 [[SWITCH_CAST]], 8
+; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i24 460303, [[SWITCH_SHIFTAMT]]
+; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = trunc i24 [[SWITCH_DOWNSHIFT]] to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[TMP0]], i8 [[SWITCH_MASKED]], i8 0
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+entry:
+  %rem = urem i64 %x, 9 ; 9 * 8 = 72 > 64, does not fit in a register
+  switch i64 %rem, label %default [
+  i64 0, label %sw.bb0
+  i64 1, label %sw.bb1
+  i64 2, label %sw.bb2
+  ]
+
+sw.bb0: ; preds = %entry
+  br label %lor.end
+
+sw.bb1: ; preds = %entry
+  br label %lor.end
+
+sw.bb2: ; preds = %entry
+  br label %lor.end
+
+default: ; preds = %entry
+  br label %lor.end
+
+lor.end:
+  %0 = phi i8 [ 15, %sw.bb0 ], [ 6, %sw.bb1 ], [ 7, %sw.bb2 ], [ 0, %default ]
+  ret i8 %0
+}

From 5e07481d4240b5e8fd85f9b92df30849606c2af0 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407
Date: Fri, 1 Sep 2023 23:05:45 -0400
Subject: [PATCH 044/877] [SimplifyCFG] Delete the unnecessary range check for
 small mask operation

When the small mask value is less than 64, we can eliminate the check for
the upper limit of the range by enlarging the lookup table size to the
maximum index value. (The final table size then grows to the next
power-of-two value.)

```
bool f(unsigned x) {
  switch (x % 8) {
    case 0: return 1;
    case 1: return 0;
    case 2: return 0;
    case 3: return 1;
    case 4: return 1;
    case 5: return 0;
    case 6: return 1;
    // This would remove the range check:
    case 7: return 0;
  }
  return 0;
}
```

Use WouldFitInRegister instead of fitsInLegalInteger to support more
result types besides bool.

Fixes https://github.com/llvm/llvm-project/issues/65120

Reviewed By: zmodem, nikic, RKSimon
---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 22 +++++++++++++++--
 .../Transforms/SimplifyCFG/switch_mask.ll     | 24 ++++++++-----------
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 68b5b1a78a346..18187bcdedf09 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6598,9 +6598,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   // If the default destination is unreachable, or if the lookup table covers
   // all values of the conditional variable, branch directly to the lookup table
   // BB. Otherwise, check that the condition is within the case range.
-  const bool DefaultIsReachable =
+  bool DefaultIsReachable =
       !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
-  const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);

   // Create the BB that does the lookups.
   Module &Mod = *CommonDest->getParent()->getParent();
@@ -6631,6 +6630,25 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,

   BranchInst *RangeCheckBranch = nullptr;

+  // Grow the table to cover all possible index values to avoid the range check.
+  if (UseSwitchConditionAsTableIndex) {
+    ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false);
+    // Growing the table shouldn't have any size impact given the
+    // WouldFitInRegister check.
+    // TODO: Consider growing the table also when it doesn't fit in a register
+    // if no optsize is specified.
+ if (all_of(ResultTypes, [&](const auto &KV) { + return SwitchLookupTable::WouldFitInRegister( + DL, CR.getUpper().getLimitedValue(), KV.second /* ResultType */); + })) { + // The default branch is unreachable when we enlarge the lookup table. + // Adjust DefaultIsReachable to reuse code path. + TableSize = CR.getUpper().getZExtValue(); + DefaultIsReachable = false; + } + } + + const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); if (!DefaultIsReachable || GeneratingCoveredLookupTable) { Builder.CreateBr(LookupBB); if (DTU) diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index 8c97a0660d070..123519bc69211 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -8,13 +8,11 @@ define i1 @switch_lookup_with_small_i1(i64 %x) { ; CHECK-LABEL: @switch_lookup_with_small_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND:%.*]] = and i64 [[X:%.*]], 15 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[AND]], 11 -; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[AND]] to i11 -; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i11 [[SWITCH_CAST]], 1 -; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i11 -1018, [[SWITCH_SHIFTAMT]] -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i11 [[SWITCH_DOWNSHIFT]] to i1 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[AND]] to i16 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i16 [[SWITCH_CAST]], 1 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i16 1030, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i16 [[SWITCH_DOWNSHIFT]] to i1 +; CHECK-NEXT: ret i1 [[SWITCH_MASKED]] ; entry: %and = and i64 %x, 15 @@ -37,13 +35,11 @@ define i8 @switch_lookup_with_small_i8(i64 %x) { ; CHECK-LABEL: @switch_lookup_with_small_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[X:%.*]], 5 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[REM]], 3 -; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i24 -; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i24 [[SWITCH_CAST]], 8 -; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i24 460303, [[SWITCH_SHIFTAMT]] -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i24 [[SWITCH_DOWNSHIFT]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i8 [[SWITCH_MASKED]], i8 0 -; CHECK-NEXT: ret i8 [[TMP1]] +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i40 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i40 [[SWITCH_CAST]], 8 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i40 460303, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i40 [[SWITCH_DOWNSHIFT]] to i8 +; CHECK-NEXT: ret i8 [[SWITCH_MASKED]] ; entry: %rem = urem i64 %x, 5 From b47ff361345a02e783452fdfb03eab3a7718758e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 26 Oct 2023 13:12:30 +0200 Subject: [PATCH 045/877] [InstCombine] Drop exact flag instead of increasing demanded bits (#70311) Demanded bit simplification for lshr/ashr will currently demand the low bits if the exact flag is set. This is because these bits must be zero to satisfy the flag. However, this means that our demanded bits simplification is worse for lshr/ashr exact than it is for plain lshr/ashr, which is generally not desirable. Instead, drop the exact flag if a demanded bits simplification of the operand succeeds, which may no longer satisfy the exact flag. 
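For example, in the `lshr_exact_and_not_demanded` case from the updated
shift.ll test below, bit 0 of the shift operand is no longer demanded, so
the `and` can be removed once the `exact` flag is dropped:

```
  %y = and i8 %x, -2
  %y.ext = sext i8 %y to i16
  %shr = lshr exact i16 %y.ext, 1
```
becomes:
```
  %y.ext = sext i8 %x to i16
  %shr = lshr i16 %y.ext, 1
```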
This matches what we do for the exact flag on udiv, as well as the
nuw/nsw flags on add/sub/mul.
---
 .../InstCombineSimplifyDemanded.cpp           | 21 +++++++------------
 llvm/test/Transforms/InstCombine/cast.ll      |  4 ++--
 llvm/test/Transforms/InstCombine/select-2.ll  |  2 +-
 llvm/test/Transforms/InstCombine/shift.ll     |  8 +++----
 4 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index be005e61a8d2d..cd6b017874e8d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -698,14 +698,11 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     // Unsigned shift right.
     APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
-
-    // If the shift is exact, then it does demand the low bits (and knows that
-    // they are zero).
-    if (cast<BinaryOperator>(I)->isExact())
-      DemandedMaskIn.setLowBits(ShiftAmt);
-
-    if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) {
+      // exact flag may no longer hold.
+      I->dropPoisonGeneratingFlags();
       return I;
+    }
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");
     Known.Zero.lshrInPlace(ShiftAmt);
     Known.One.lshrInPlace(ShiftAmt);
@@ -747,13 +744,11 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     if (DemandedMask.countl_zero() <= ShiftAmt)
       DemandedMaskIn.setSignBit();

-    // If the shift is exact, then it does demand the low bits (and knows that
-    // they are zero).
-    if (cast<BinaryOperator>(I)->isExact())
-      DemandedMaskIn.setLowBits(ShiftAmt);
-
-    if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) {
+      // exact flag may no longer hold.
+      I->dropPoisonGeneratingFlags();
       return I;
+    }
     assert(!Known.hasConflict() && "Bits known to be one AND zero?");

     // Compute the new bits that are at the top now plus sign bits.
diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index 5b480b1157936..59e488f3f23d5 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -1318,7 +1318,7 @@ define i64 @test83(i16 %a, i64 %k) { define i8 @test84(i32 %a) { ; ALL-LABEL: @test84( ; ALL-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], 2130706432 -; ALL-NEXT: [[SHR:%.*]] = lshr exact i32 [[ADD]], 23 +; ALL-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 23 ; ALL-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR]] to i8 ; ALL-NEXT: ret i8 [[TRUNC]] ; @@ -1331,7 +1331,7 @@ define i8 @test84(i32 %a) { define i8 @test85(i32 %a) { ; ALL-LABEL: @test85( ; ALL-NEXT: [[ADD:%.*]] = add i32 [[A:%.*]], 2130706432 -; ALL-NEXT: [[SHR:%.*]] = lshr exact i32 [[ADD]], 23 +; ALL-NEXT: [[SHR:%.*]] = lshr i32 [[ADD]], 23 ; ALL-NEXT: [[TRUNC:%.*]] = trunc i32 [[SHR]] to i8 ; ALL-NEXT: ret i8 [[TRUNC]] ; diff --git a/llvm/test/Transforms/InstCombine/select-2.ll b/llvm/test/Transforms/InstCombine/select-2.ll index 2e4161f5d80aa..148b0dcf10259 100644 --- a/llvm/test/Transforms/InstCombine/select-2.ll +++ b/llvm/test/Transforms/InstCombine/select-2.ll @@ -45,7 +45,7 @@ define float @t3(float %x, float %y) { define i8 @ashr_exact_poison_constant_fold(i1 %b, i8 %x) { ; CHECK-LABEL: @ashr_exact_poison_constant_fold( -; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i8 [[X:%.*]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = ashr i8 [[X:%.*]], 3 ; CHECK-NEXT: [[R:%.*]] = select i1 [[B:%.*]], i8 [[TMP1]], i8 5 ; CHECK-NEXT: ret i8 [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll index 7b9626331ff29..913ef2a74aebd 100644 --- a/llvm/test/Transforms/InstCombine/shift.ll +++ b/llvm/test/Transforms/InstCombine/shift.ll @@ -2141,9 +2141,8 @@ define i16 @lshr_and_not_demanded(i8 %x) { define i16 @lshr_exact_and_not_demanded(i8 %x) { ; CHECK-LABEL: @lshr_exact_and_not_demanded( -; CHECK-NEXT: [[Y:%.*]] = and i8 [[X:%.*]], -2 -; CHECK-NEXT: [[Y_EXT:%.*]] = sext i8 [[Y]] to i16 -; CHECK-NEXT: [[SHR:%.*]] = lshr exact i16 [[Y_EXT]], 1 +; CHECK-NEXT: [[Y_EXT:%.*]] = sext i8 [[X:%.*]] to i16 +; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[Y_EXT]], 1 ; CHECK-NEXT: ret i16 [[SHR]] ; %y = and i8 %x, -2 @@ -2177,8 +2176,7 @@ define i16 @ashr_umax_not_demanded(i16 %x) { define i16 @ashr_exact_umax_not_demanded(i16 %x) { ; CHECK-LABEL: @ashr_exact_umax_not_demanded( -; CHECK-NEXT: [[Y:%.*]] = call i16 @llvm.umax.i16(i16 [[X:%.*]], i16 1) -; CHECK-NEXT: [[SHR:%.*]] = ashr exact i16 [[Y]], 1 +; CHECK-NEXT: [[SHR:%.*]] = ashr i16 [[X:%.*]], 1 ; CHECK-NEXT: ret i16 [[SHR]] ; %y = call i16 @llvm.umax.i16(i16 %x, i16 1) From eb737d6a7644d66ae57b0b4b8a30c6d2d2d961dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 26 Oct 2023 13:08:24 +0200 Subject: [PATCH 046/877] [clang][Interp][NFC] Make another Descriptor param const --- clang/lib/AST/Interp/InterpBlock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/AST/Interp/InterpBlock.h b/clang/lib/AST/Interp/InterpBlock.h index 4ab67ebb9eaaf..9d0c4859fd06c 100644 --- a/clang/lib/AST/Interp/InterpBlock.h +++ b/clang/lib/AST/Interp/InterpBlock.h @@ -49,7 +49,7 @@ enum PrimType : unsigned; class Block final { public: /// Creates a new block. 
-  Block(const std::optional<unsigned> &DeclID, Descriptor *Desc,
+  Block(const std::optional<unsigned> &DeclID, const Descriptor *Desc,
         bool IsStatic = false, bool IsExtern = false)
       : DeclID(DeclID), IsStatic(IsStatic), IsExtern(IsExtern), Desc(Desc) {}

From 96e040acee7c1728506ec49a5a229bfecd49f7db Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Thu, 26 Oct 2023 12:18:58 +0100
Subject: [PATCH 047/877] [mlir][ArmSVE] Add `-arm-sve-legalize-vector-storage`
 pass (#68794)

This patch adds a pass that ensures that loads, stores, and allocations of
SVE vector types will be legal in the LLVM backend. It does this at the
memref level, so this pass must be applied before lowering all the way to
LLVM.

This pass currently fixes two issues.

## Loading and storing predicate types

It is only legal to load/store predicate types equal to (or greater than) a
full predicate register, which in MLIR is `vector<[16]xi1>`. Smaller
predicate types (`vector<[1|2|4|8]xi1>`) must be converted to/from a full
predicate type (referred to as a `svbool`) before and after storing and
loading respectively. This pass does this by widening allocations and
inserting conversion intrinsics. For example:

```mlir
%alloca = memref.alloca() : memref<vector<[4]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
%reload = memref.load %alloca[] : memref<vector<[4]xi1>>
```
Becomes:
```mlir
%alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
%mask = vector.constant_mask [4] : vector<[4]xi1>
%svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
%reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
%reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
```

## Relax alignments for SVE vector allocas

The storage for SVE vector types only needs to have an alignment that
matches the element type (for example 4 byte alignment for `f32`s). However,
the LLVM backend currently defaults to aligning to `base size x element
size` bytes. For non-legal vector types like `vector<[8]xf32>` this results
in 8 x 4 = 32-byte alignment, but the backend only supports up to 16-byte
alignment for SVE vectors on the stack. Explicitly setting a smaller
alignment prevents this issue.
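For example, the pass rewrites an alloca like the following to carry an
explicit 16-byte alignment (an illustrative before/after, mirroring the
tests added below):

```mlir
// Before: the backend would pick 8 x 4 = 32-byte alignment.
%alloca = memref.alloca() : memref<vector<[8]xf32>>
// After: the alignment is explicitly capped at 16 bytes.
%alloca = memref.alloca() {alignment = 16 : i64} : memref<vector<[8]xf32>>
```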
Depends on: #68586 and #68695 (for testing)
---
 .../mlir/Dialect/ArmSVE/CMakeLists.txt        |   1 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   5 +
 .../mlir/Dialect/ArmSVE/Transforms/Passes.h   |  36 ++
 .../mlir/Dialect/ArmSVE/Transforms/Passes.td  |  68 ++++
 mlir/include/mlir/InitAllPasses.h             |   2 +
 .../Dialect/ArmSVE/Transforms/CMakeLists.txt  |   2 +
 .../Transforms/LegalizeVectorStorage.cpp      | 338 ++++++++++++++++++
 .../ArmSVE/legalize-vector-storage.mlir       | 203 +++++++++++
 .../ArmSVE/arrays-of-scalable-vectors.mlir    | 117 ++++++
 9 files changed, 772 insertions(+)
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
 create mode 100644 mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
 create mode 100644 mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp
 create mode 100644 mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir

diff --git a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
index f33061b2d87cf..9f57627c321fb 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
+++ b/mlir/include/mlir/Dialect/ArmSVE/CMakeLists.txt
@@ -1 +1,2 @@
 add_subdirectory(IR)
+add_subdirectory(Transforms)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
new file mode 100644
index 0000000000000..7226642daf861
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(LLVM_TARGET_DEFINITIONS Passes.td)
+mlir_tablegen(Passes.h.inc -gen-pass-decls -name ArmSVE)
+add_public_tablegen_target(MLIRArmSVEPassIncGen)
+
+add_mlir_doc(Passes ArmSVEPasses ./ -gen-pass-doc)
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
new file mode 100644
index 0000000000000..66f30a67cb05b
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.h
@@ -0,0 +1,36 @@
+//===- Passes.h - Pass Entrypoints ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
+
+#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
+#include "mlir/Pass/Pass.h"
+
+namespace mlir::arm_sve {
+
+#define GEN_PASS_DECL
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+/// Pass to legalize Arm SVE vector storage.
+std::unique_ptr<Pass> createLegalizeVectorStoragePass();
+
+/// Collect a set of patterns to legalize Arm SVE vector storage.
+void populateLegalizeVectorStoragePatterns(RewritePatternSet &patterns);
+
+//===----------------------------------------------------------------------===//
+// Registration
+//===----------------------------------------------------------------------===//
+
+/// Generate the code for registering passes.
+#define GEN_PASS_REGISTRATION
+#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc"
+
+} // namespace mlir::arm_sve
+
+#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_H
diff --git a/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
new file mode 100644
index 0000000000000..d7cb309db5253
--- /dev/null
+++ b/mlir/include/mlir/Dialect/ArmSVE/Transforms/Passes.td
@@ -0,0 +1,68 @@
+//===-- Passes.td - ArmSVE pass definition file ------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+#define MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD
+
+include "mlir/Pass/PassBase.td"
+
+def LegalizeVectorStorage
+    : Pass<"arm-sve-legalize-vector-storage", "mlir::func::FuncOp"> {
+  let summary = "Ensures stores of SVE vector types will be legal";
+  let description = [{
+    This pass ensures that loads, stores, and allocations of SVE vector types
+    will be legal in the LLVM backend. It does this at the memref level, so this
+    pass must be applied before lowering all the way to LLVM.
+
+    This pass currently addresses two issues.
+
+    ## Loading and storing predicate types
+
+    It is only legal to load/store predicate types equal to (or greater than) a
+    full predicate register, which in MLIR is `vector<[16]xi1>`. Smaller
+    predicate types (`vector<[1|2|4|8]xi1>`) must be converted to/from a full
+    predicate type (referred to as a `svbool`) before and after storing and
+    loading respectively. This pass does this by widening allocations and
+    inserting conversion intrinsics. Note: Non-powers-of-two masks (e.g.
+    `vector<[7]xi1>`), which are not SVE predicates, are ignored.
+
+    For example:
+
+    ```mlir
+    %alloca = memref.alloca() : memref<vector<[4]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+    %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+    ```
+    Becomes:
+    ```mlir
+    %alloca = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+    %mask = vector.constant_mask [4] : vector<[4]xi1>
+    %svbool = arm_sve.convert_to_svbool %mask : vector<[4]xi1>
+    memref.store %svbool, %alloca[] : memref<vector<[16]xi1>>
+    %reload_svbool = memref.load %alloca[] : memref<vector<[16]xi1>>
+    %reload = arm_sve.convert_from_svbool %reload_svbool : vector<[4]xi1>
+    ```
+
+    ## Relax alignments for SVE vector allocas
+
+    The storage for SVE vector types only needs to have an alignment that
+    matches the element type (for example 4 byte alignment for `f32`s). However,
+    the LLVM backend currently defaults to aligning to `base size` x
+    `element size` bytes. For non-legal vector types like `vector<[8]xf32>` this
+    results in 8 x 4 = 32-byte alignment, but the backend only supports up to
+    16-byte alignment for SVE vectors on the stack. Explicitly setting a smaller
+    alignment prevents this issue.
+ }]; + let constructor = "mlir::arm_sve::createLegalizeVectorStoragePass()"; + let dependentDialects = ["func::FuncDialect", + "memref::MemRefDialect", "vector::VectorDialect", + "arm_sve::ArmSVEDialect"]; +} + +#endif // MLIR_DIALECT_ARMSVE_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h index 5489a13a8040b..7301905954f56 100644 --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -19,6 +19,7 @@ #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Arith/Transforms/Passes.h" #include "mlir/Dialect/ArmSME/Transforms/Passes.h" +#include "mlir/Dialect/ArmSVE/Transforms/Passes.h" #include "mlir/Dialect/Async/Passes.h" #include "mlir/Dialect/Bufferization/Pipelines/Passes.h" #include "mlir/Dialect/Bufferization/Transforms/Passes.h" @@ -82,6 +83,7 @@ inline void registerAllPasses() { transform::registerTransformPasses(); vector::registerVectorPasses(); arm_sme::registerArmSMEPasses(); + arm_sve::registerArmSVEPasses(); // Dialect pipelines bufferization::registerBufferizationPipelines(); diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt index 2f1c43fae240d..a70c489a51fea 100644 --- a/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/ArmSVE/Transforms/CMakeLists.txt @@ -1,8 +1,10 @@ add_mlir_dialect_library(MLIRArmSVETransforms LegalizeForLLVMExport.cpp + LegalizeVectorStorage.cpp DEPENDS MLIRArmSVEConversionsIncGen + MLIRArmSVEPassIncGen LINK_LIBS PUBLIC MLIRArmSVEDialect diff --git a/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp new file mode 100644 index 0000000000000..bee1f3659753b --- /dev/null +++ b/mlir/lib/Dialect/ArmSVE/Transforms/LegalizeVectorStorage.cpp @@ -0,0 +1,338 @@ +//===- LegalizeVectorStorage.cpp - Ensures SVE loads/stores are legal -----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/ArmSVE/IR/ArmSVEDialect.h" +#include "mlir/Dialect/ArmSVE/Transforms/Passes.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir::arm_sve { +#define GEN_PASS_DEF_LEGALIZEVECTORSTORAGE +#include "mlir/Dialect/ArmSVE/Transforms/Passes.h.inc" +} // namespace mlir::arm_sve + +using namespace mlir; +using namespace mlir::arm_sve; + +// A tag to mark unrealized_conversions produced by this pass. This is used to +// detect IR this pass failed to completely legalize, and report an error. +// If everything was successfully legalized, no tagged ops will remain after +// this pass. +constexpr StringLiteral kSVELegalizerTag("__arm_sve_legalize_vector_storage__"); + +/// Definitions: +/// +/// [1] svbool = vector<...x[16]xi1>, which maps to some multiple of full SVE +/// predicate registers. A full predicate is the smallest quantity that can be +/// loaded/stored. +/// +/// [2] SVE mask = hardware-sized SVE predicate mask, i.e. its trailing +/// dimension matches the size of a legal SVE vector size (such as +/// vector<[4]xi1>), but is too small to be stored to memory (i.e smaller than +/// a svbool). 
+
+namespace {
+
+/// Checks if a vector type is an SVE mask [2].
+bool isSVEMaskType(VectorType type) {
+  return type.getRank() > 0 && type.getElementType().isInteger(1) &&
+         type.getScalableDims().back() && type.getShape().back() < 16 &&
+         llvm::isPowerOf2_32(type.getShape().back()) &&
+         !llvm::is_contained(type.getScalableDims().drop_back(), true);
+}
+
+VectorType widenScalableMaskTypeToSvbool(VectorType type) {
+  assert(isSVEMaskType(type));
+  return VectorType::Builder(type).setDim(type.getRank() - 1, 16);
+}
+
+/// A helper for cloning an op and replacing it with a new version, updated by a
+/// callback.
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithLegalizedOp(PatternRewriter &rewriter, TOp op,
+                              TLegalizerCallback callback) {
+  // Clone the previous op to preserve any properties/attributes.
+  auto newOp = op.clone();
+  rewriter.insert(newOp);
+  rewriter.replaceOp(op, callback(newOp));
+}
+
+/// A helper for cloning an op and replacing it with a new version, updated by a
+/// callback, and an unrealized conversion back to the type of the replaced op.
+template <typename TOp, typename TLegalizerCallback>
+void replaceOpWithUnrealizedConversion(PatternRewriter &rewriter, TOp op,
+                                       TLegalizerCallback callback) {
+  replaceOpWithLegalizedOp(rewriter, op, [&](TOp newOp) {
+    // Mark our `unrealized_conversion_casts` with a pass label.
+    return rewriter.create<UnrealizedConversionCastOp>(
+        op.getLoc(), TypeRange{op.getResult().getType()},
+        ValueRange{callback(newOp)},
+        NamedAttribute(rewriter.getStringAttr(kSVELegalizerTag),
+                       rewriter.getUnitAttr()));
+  });
+}
+
+/// Extracts the widened SVE memref value (that's legal to store/load) from the
+/// `unrealized_conversion_cast`s added by this pass.
+static FailureOr<Value> getSVELegalizedMemref(Value illegalMemref) {
+  Operation *definingOp = illegalMemref.getDefiningOp();
+  if (!definingOp || !definingOp->hasAttr(kSVELegalizerTag))
+    return failure();
+  auto unrealizedConversion =
+      llvm::cast<UnrealizedConversionCastOp>(definingOp);
+  return unrealizedConversion.getOperand(0);
+}
+
+/// The default alignment of an alloca in LLVM may request overaligned sizes for
+/// SVE types, which will fail during stack frame allocation. This rewrite
+/// explicitly adds a reasonable alignment to allocas of scalable types.
+struct RelaxScalableVectorAllocaAlignment
+    : public OpRewritePattern<memref::AllocaOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::AllocaOp allocaOp,
+                                PatternRewriter &rewriter) const override {
+    auto memrefElementType = allocaOp.getType().getElementType();
+    auto vectorType = llvm::dyn_cast<VectorType>(memrefElementType);
+    if (!vectorType || !vectorType.isScalable() || allocaOp.getAlignment())
+      return failure();
+
+    // Set alignment based on the defaults for SVE vectors and predicates.
+    unsigned alignment = vectorType.getElementType().isInteger(1) ? 2 : 16;
+    allocaOp.setAlignment(alignment);
+
+    return success();
+  }
+};
+
+/// Replaces allocations of SVE predicates smaller than an svbool [1] (_illegal_
+/// to load/store) with a wider allocation of svbool (_legal_ to load/store)
+/// followed by a tagged unrealized conversion to the original type.
+///
+/// Example
+/// ```
+/// %alloca = memref.alloca() : memref<vector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened = memref.alloca() {alignment = 1 : i64} : memref<vector<[16]xi1>>
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///             : memref<vector<[16]xi1>> to memref<vector<[8]xi1>>
+///             {__arm_sve_legalize_vector_storage__}
+/// ```
+template <typename AllocLikeOp>
+struct LegalizeSVEMaskAllocation : public OpRewritePattern<AllocLikeOp> {
+  using OpRewritePattern<AllocLikeOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(AllocLikeOp allocLikeOp,
+                                PatternRewriter &rewriter) const override {
+    auto vectorType =
+        llvm::dyn_cast<VectorType>(allocLikeOp.getType().getElementType());
+
+    if (!vectorType || !isSVEMaskType(vectorType))
+      return failure();
+
+    // Replace this alloc-like op of an SVE mask [2] with one of a (storable)
+    // svbool mask [1]. A temporary unrealized_conversion_cast is added to the
+    // old type to allow local rewrites.
+    replaceOpWithUnrealizedConversion(
+        rewriter, allocLikeOp, [&](AllocLikeOp newAllocLikeOp) {
+          newAllocLikeOp.getResult().setType(
+              llvm::cast<MemRefType>(newAllocLikeOp.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newAllocLikeOp;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces vector.type_casts of unrealized conversions to SVE predicate memref
+/// types that are _illegal_ to load/store from (!= svbool [1]), with type casts
+/// of memref types that are _legal_ to load/store, followed by unrealized
+/// conversions.
+///
+/// Example:
+/// ```
+/// %alloca = builtin.unrealized_conversion_cast %widened
+///             : memref<vector<3x[16]xi1>> to memref<vector<3x[8]xi1>>
+///             {__arm_sve_legalize_vector_storage__}
+/// %cast = vector.type_cast %alloca
+///             : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %widened_cast = vector.type_cast %widened
+///             : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+/// %cast = builtin.unrealized_conversion_cast %widened_cast
+///             : memref<3xvector<[16]xi1>> to memref<3xvector<[8]xi1>>
+///             {__arm_sve_legalize_vector_storage__}
+/// ```
+struct LegalizeSVEMaskTypeCastConversion
+    : public OpRewritePattern<vector::TypeCastOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(vector::TypeCastOp typeCastOp,
+                                PatternRewriter &rewriter) const override {
+    auto resultType = typeCastOp.getResultMemRefType();
+    auto vectorType = llvm::dyn_cast<VectorType>(resultType.getElementType());
+
+    if (!vectorType || !isSVEMaskType(vectorType))
+      return failure();
+
+    auto legalMemref = getSVELegalizedMemref(typeCastOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    // Replace this vector.type_cast with one of a (storable) svbool mask [1].
+    replaceOpWithUnrealizedConversion(
+        rewriter, typeCastOp, [&](vector::TypeCastOp newTypeCast) {
+          newTypeCast.setOperand(*legalMemref);
+          newTypeCast.getResult().setType(
+              llvm::cast<MemRefType>(newTypeCast.getType().cloneWith(
+                  {}, widenScalableMaskTypeToSvbool(vectorType))));
+          return newTypeCast;
+        });
+
+    return success();
+  }
+};
+
+/// Replaces stores to unrealized conversions to SVE predicate memref types that
+/// are _illegal_ to load/store from (!= svbool [1]), with
+/// `arm_sve.convert_to_svbool`s followed by (legal) wider stores.
+///
+/// Example:
+/// ```
+/// memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = arm_sve.convert_to_svbool %mask : vector<[8]xi1>
+/// memref.store %svbool, %widened[] : memref<vector<[16]xi1>>
+/// ```
+struct LegalizeSVEMaskStoreConversion
+    : public OpRewritePattern<memref::StoreOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::StoreOp storeOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = storeOp.getLoc();
+
+    Value valueToStore = storeOp.getValueToStore();
+    auto vectorType = llvm::dyn_cast<VectorType>(valueToStore.getType());
+
+    if (!vectorType || !isSVEMaskType(vectorType))
+      return failure();
+
+    auto legalMemref = getSVELegalizedMemref(storeOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(
+        llvm::cast<VectorType>(valueToStore.getType()));
+    auto convertToSvbool = rewriter.create<ConvertToSvboolOp>(
+        loc, legalMaskType, valueToStore);
+    // Replace this store with a conversion to a storable svbool mask [1],
+    // followed by a wider store.
+    replaceOpWithLegalizedOp(rewriter, storeOp,
+                             [&](memref::StoreOp newStoreOp) {
+                               newStoreOp.setOperand(0, convertToSvbool);
+                               newStoreOp.setOperand(1, *legalMemref);
+                               return newStoreOp;
+                             });
+
+    return success();
+  }
+};
+
+/// Replaces loads from unrealized conversions to SVE predicate memref types
+/// that are _illegal_ to load/store from (!= svbool [1]), with (legal)
+/// wider loads, followed by `arm_sve.convert_from_svbool`s.
+///
+/// Example:
+/// ```
+/// %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+/// ```
+/// is rewritten into:
+/// ```
+/// %svbool = memref.load %widened[] : memref<vector<[16]xi1>>
+/// %reload = arm_sve.convert_from_svbool %svbool : vector<[4]xi1>
+/// ```
+struct LegalizeSVEMaskLoadConversion : public OpRewritePattern<memref::LoadOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(memref::LoadOp loadOp,
+                                PatternRewriter &rewriter) const override {
+    auto loc = loadOp.getLoc();
+
+    Value loadedMask = loadOp.getResult();
+    auto vectorType = llvm::dyn_cast<VectorType>(loadedMask.getType());
+
+    if (!vectorType || !isSVEMaskType(vectorType))
+      return failure();
+
+    auto legalMemref = getSVELegalizedMemref(loadOp.getMemref());
+    if (failed(legalMemref))
+      return failure();
+
+    auto legalMaskType = widenScalableMaskTypeToSvbool(vectorType);
+    // Replace this load with a legal load of an svbool type, followed by a
+    // conversion back to the original type.
+    replaceOpWithLegalizedOp(rewriter, loadOp, [&](memref::LoadOp newLoadOp) {
+      newLoadOp.setMemRef(*legalMemref);
+      newLoadOp.getResult().setType(legalMaskType);
+      return rewriter.create<ConvertFromSvboolOp>(
+          loc, loadedMask.getType(), newLoadOp);
+    });
+
+    return success();
+  }
+};
+
+} // namespace
+
+void mlir::arm_sve::populateLegalizeVectorStoragePatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<RelaxScalableVectorAllocaAlignment,
+               LegalizeSVEMaskAllocation<memref::AllocaOp>,
+               LegalizeSVEMaskAllocation<memref::AllocOp>,
+               LegalizeSVEMaskTypeCastConversion,
+               LegalizeSVEMaskStoreConversion, LegalizeSVEMaskLoadConversion>(
+      patterns.getContext());
+}
+
+namespace {
+struct LegalizeVectorStorage
+    : public arm_sve::impl::LegalizeVectorStorageBase<LegalizeVectorStorage> {
+
+  void runOnOperation() override {
+    RewritePatternSet patterns(&getContext());
+    populateLegalizeVectorStoragePatterns(patterns);
+    if (failed(applyPatternsAndFoldGreedily(getOperation(),
+                                            std::move(patterns)))) {
+      signalPassFailure();
+    }
+    ConversionTarget target(getContext());
+    target.addDynamicallyLegalOp<UnrealizedConversionCastOp>(
+        [](UnrealizedConversionCastOp unrealizedConversion) {
+          return !unrealizedConversion->hasAttr(kSVELegalizerTag);
+        });
+    // This detects if we failed to completely legalize the IR.
+    if (failed(applyPartialConversion(getOperation(), target, {})))
+      signalPassFailure();
+  }
+};
+
+} // namespace
+
+std::unique_ptr<Pass> mlir::arm_sve::createLegalizeVectorStoragePass() {
+  return std::make_unique<LegalizeVectorStorage>();
+}
diff --git a/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
new file mode 100644
index 0000000000000..9a3df8376f121
--- /dev/null
+++ b/mlir/test/Dialect/ArmSVE/legalize-vector-storage.mlir
@@ -0,0 +1,203 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -arm-sve-legalize-vector-storage -split-input-file -verify-diagnostics | FileCheck %s

+/// This tests the basic functionality of the -arm-sve-legalize-vector-storage pass.
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv1i1(
+// CHECK-SAME: %[[MASK:.*]]: vector<[1]xi1>)
+func.func @store_and_reload_sve_predicate_nxv1i1(%mask: vector<[1]xi1>) -> vector<[1]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[1]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[1]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[1]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[1]xi1>
+  return %reload : vector<[1]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv2i1(
+// CHECK-SAME: %[[MASK:.*]]: vector<[2]xi1>)
+func.func @store_and_reload_sve_predicate_nxv2i1(%mask: vector<[2]xi1>) -> vector<[2]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[2]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[2]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[2]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[2]xi1>
+  return %reload : vector<[2]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv4i1(
+// CHECK-SAME: %[[MASK:.*]]: vector<[4]xi1>)
+func.func @store_and_reload_sve_predicate_nxv4i1(%mask: vector<[4]xi1>) -> vector<[4]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[4]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[4]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[4]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[4]xi1>
+  return %reload : vector<[4]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv8i1(
+// CHECK-SAME: %[[MASK:.*]]: vector<[8]xi1>)
+func.func @store_and_reload_sve_predicate_nxv8i1(%mask: vector<[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[MASK:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %reload = memref.load %alloca[] : memref<vector<[8]xi1>>
+  // CHECK-NEXT: return %[[MASK]] : vector<[8]xi1>
+  return %reload : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_and_reload_sve_predicate_nxv16i1(
+// CHECK-SAME: %[[MASK:.*]]: vector<[16]xi1>)
+func.func @store_and_reload_sve_predicate_nxv16i1(%mask: vector<[16]xi1>) -> vector<[16]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<[16]xi1>>
+  // CHECK-NEXT: memref.store %[[MASK]], %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[16]xi1>>
+  %reload = memref.load %alloca[] : memref<vector<[16]xi1>>
+  // CHECK-NEXT: return %[[RELOAD]] : vector<[16]xi1>
+  return %reload : vector<[16]xi1>
+}
+
+// -----
+
+/// This is not a valid SVE mask type, so it is ignored by the
+/// `-arm-sve-legalize-vector-storage` pass.
+
+// CHECK-LABEL: @store_and_reload_unsupported_type(
+// CHECK-SAME: %[[MASK:.*]]: vector<[7]xi1>)
+func.func @store_and_reload_unsupported_type(%mask: vector<[7]xi1>) -> vector<[7]xi1> {
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<[7]xi1>>
+  %alloca = memref.alloca() : memref<vector<[7]xi1>>
+  // CHECK-NEXT: memref.store %[[MASK]], %[[ALLOCA]][] : memref<vector<[7]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<[7]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[ALLOCA]][] : memref<vector<[7]xi1>>
+  %reload = memref.load %alloca[] : memref<vector<[7]xi1>>
+  // CHECK-NEXT: return %[[RELOAD]] : vector<[7]xi1>
+  return %reload : vector<[7]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @store_2d_mask_and_reload_slice(
+// CHECK-SAME: %[[MASK:.*]]: vector<3x[8]xi1>)
+func.func @store_2d_mask_and_reload_slice(%mask: vector<3x[8]xi1>) -> vector<[8]xi1> {
+  // CHECK-NEXT: %[[C0:.*]] = arith.constant 0 : index
+  %c0 = arith.constant 0 : index
+  // CHECK-NEXT: %[[ALLOCA:.*]] = memref.alloca() {alignment = 2 : i64} : memref<vector<3x[16]xi1>>
+  %alloca = memref.alloca() : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[SVBOOL:.*]] = arm_sve.convert_to_svbool %[[MASK]] : vector<3x[8]xi1>
+  // CHECK-NEXT: memref.store %[[SVBOOL]], %[[ALLOCA]][] : memref<vector<3x[16]xi1>>
+  memref.store %mask, %alloca[] : memref<vector<3x[8]xi1>>
+  // CHECK-NEXT: %[[UNPACK:.*]] = vector.type_cast %[[ALLOCA]] : memref<vector<3x[16]xi1>> to memref<3xvector<[16]xi1>>
+  %unpack = vector.type_cast %alloca : memref<vector<3x[8]xi1>> to memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: %[[RELOAD:.*]] = memref.load %[[UNPACK]][%[[C0]]] : memref<3xvector<[16]xi1>>
+  // CHECK-NEXT: %[[SLICE:.*]] = arm_sve.convert_from_svbool %[[RELOAD]] : vector<[8]xi1>
+  %slice = memref.load %unpack[%c0] : memref<3xvector<[8]xi1>>
+  // CHECK-NEXT: return %[[SLICE]] : vector<[8]xi1>
+  return %slice : vector<[8]xi1>
+}
+
+// -----
+
+// CHECK-LABEL: @set_sve_alloca_alignment
+func.func @set_sve_alloca_alignment() {
+  /// This checks that the alignment of allocas of scalable vectors will be
+  /// something the backend can handle. Currently, the backend sets the
+  /// alignment of scalable vectors to their base size (i.e. their size at
+  /// vscale = 1). This works for hardware-sized types, which always get a
+  /// 16-byte alignment. The problem is that larger types, e.g. vector<[8]xf32>,
+  /// end up with alignments larger than 16 bytes (e.g. 32 bytes here), which
+  /// are unsupported. The `-arm-sve-legalize-vector-storage` pass avoids this
+  /// issue by explicitly setting the alignment to 16 bytes for all scalable
+  /// vectors.
+
+  // CHECK-COUNT-6: alignment = 16
+  %a1 = memref.alloca() : memref>
+  %a2 = memref.alloca() : memref>
+  %a3 = memref.alloca() : memref>
+  %a4 = memref.alloca() : memref>
+  %a5 = memref.alloca() : memref>
+  %a6 = memref.alloca() : memref>
+
+  // CHECK-COUNT-6: alignment = 16
+  %b1 = memref.alloca() : memref>
+  %b2 = memref.alloca() : memref>
+  %b3 = memref.alloca() : memref>
+  %b4 = memref.alloca() : memref>
+  %b5 = memref.alloca() : memref>
+  %b6 = memref.alloca() : memref>
+
+  // CHECK-COUNT-6: alignment = 16
+  %c1 = memref.alloca() : memref>
+  %c2 = memref.alloca() : memref>
+  %c3 = memref.alloca() : memref>
+  %c4 = memref.alloca() : memref>
+  %c5 = memref.alloca() : memref>
+  %c6 = memref.alloca() : memref>
+
+  // CHECK-COUNT-6: alignment = 16
+  %d1 = memref.alloca() : memref>
+  %d2 = memref.alloca() : memref>
+  %d3 = memref.alloca() : memref>
+  %d4 = memref.alloca() : memref>
+  %d5 = memref.alloca() : memref>
+  %d6 = memref.alloca() : memref>
+
+  // CHECK-COUNT-6: alignment = 16
+  %e1 = memref.alloca() : memref>
+  %e2 = memref.alloca() : memref>
+  %e3 = memref.alloca() : memref>
+  %e4 = memref.alloca() : memref>
+  %e5 = memref.alloca() : memref>
+  %e6 = memref.alloca() : memref>
+
+  // CHECK-COUNT-6: alignment = 16
+  %f1 = memref.alloca() : memref>
+  %f2 = memref.alloca() : memref>
+  %f3 = memref.alloca() : memref>
+  %f4 = memref.alloca() : memref>
+  %f5 = memref.alloca() : memref>
+  %f6 = memref.alloca() : memref>
+
+  "prevent.dce"(
+    %a1, %a2, %a3, %a4, %a5, %a6,
+    %b1, %b2, %b3, %b4, %b5, %b6,
+    %c1, %c2, %c3, %c4, %c5, %c6,
+    %d1, %d2, %d3, %d4, %d5, %d6,
+    %e1, %e2, %e3, %e4, %e5, %e6,
+    %f1, %f2, %f3, %f4, %f5, %f6)
+    : (memref>, memref>, memref>, memref>, memref>, memref>,
+       memref>, memref>, memref>, memref>, memref>, memref>,
+       memref>, memref>, memref>, memref>, memref>, memref>,
+       memref>, memref>, memref>, memref>, memref>, memref>,
+       memref>, memref>, memref>, memref>, memref>, memref>,
+       memref>, memref>, memref>, memref>, memref>, memref>) -> ()
+  return
+}
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
new file mode 100644
index 0000000000000..c486bf0de5d35
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir
@@ -0,0 +1,117 @@
+// RUN: mlir-opt %s -convert-vector-to-scf -arm-sve-legalize-vector-storage -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+/// This tests basic functionality of arrays of scalable vectors, which in MLIR
+/// are vectors with a single trailing scalable dimension. This test requires
+/// the -arm-sve-legalize-vector-storage pass to ensure the loads/stores done
+/// here are legal for the LLVM backend.
+
+func.func @read_and_print_2d_vector(%memref: memref<3x?xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %dim = memref.dim %memref, %c1 : memref<3x?xf32>
+  %mask = vector.create_mask %c2, %dim : vector<3x[8]xi1>
+  %vector = vector.transfer_read %memref[%c0,%c0], %cst, %mask {in_bounds = [true, true]} : memref<3x?xf32>, vector<3x[8]xf32>
+
+  /// TODO: Support vector.print for arrays of scalable vectors.
+ %row0 = vector.extract %vector[0] : vector<[8]xf32> from vector<3x[8]xf32> + %row1 = vector.extract %vector[1] : vector<[8]xf32> from vector<3x[8]xf32> + %row2 = vector.extract %vector[2] : vector<[8]xf32> from vector<3x[8]xf32> + + /// Print each of the vectors. + /// vscale is >= 1, so at least 8 elements will be printed. + + vector.print str "read_and_print_2d_vector()" + // CHECK-LABEL: read_and_print_2d_vector() + // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8 + vector.print %row0 : vector<[8]xf32> + // CHECK: ( 8, 8, 8, 8, 8, 8, 8, 8 + vector.print %row1 : vector<[8]xf32> + /// This last row is all zero due to our mask. + // CHECK: ( 0, 0, 0, 0, 0, 0, 0, 0 + vector.print %row2 : vector<[8]xf32> + + return +} + +func.func @print_1x2xVSCALExf32(%vector: vector<1x2x[4]xf32>) { + /// TODO: Support vector.print for arrays of scalable vectors. + %slice0 = vector.extract %vector[0, 1] : vector<[4]xf32> from vector<1x2x[4]xf32> + %slice1 = vector.extract %vector[0, 1] : vector<[4]xf32> from vector<1x2x[4]xf32> + vector.print %slice0 : vector<[4]xf32> + vector.print %slice1 : vector<[4]xf32> + return +} + +func.func @add_arrays_of_scalable_vectors(%a: memref<1x2x?xf32>, %b: memref<1x2x?xf32>) { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c3 = arith.constant 2 : index + %cst = arith.constant 0.000000e+00 : f32 + %dim_a = memref.dim %a, %c2 : memref<1x2x?xf32> + %dim_b = memref.dim %b, %c2 : memref<1x2x?xf32> + %mask_a = vector.create_mask %c2, %c3, %dim_a : vector<1x2x[4]xi1> + %mask_b = vector.create_mask %c2, %c3, %dim_b : vector<1x2x[4]xi1> + + /// Print each of the vectors. + /// vscale is >= 1, so at least 4 elements will be printed. + + // CHECK-LABEL: Vector A + // CHECK-NEXT: ( 5, 5, 5, 5 + // CHECK-NEXT: ( 5, 5, 5, 5 + vector.print str "\nVector A" + %vector_a = vector.transfer_read %a[%c0, %c0, %c0], %cst, %mask_a {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32> + func.call @print_1x2xVSCALExf32(%vector_a) : (vector<1x2x[4]xf32>) -> () + + // CHECK-LABEL: Vector B + // CHECK-NEXT: ( 4, 4, 4, 4 + // CHECK-NEXT: ( 4, 4, 4, 4 + vector.print str "\nVector B" + %vector_b = vector.transfer_read %b[%c0, %c0, %c0], %cst, %mask_b {in_bounds = [true, true, true]} : memref<1x2x?xf32>, vector<1x2x[4]xf32> + func.call @print_1x2xVSCALExf32(%vector_b) : (vector<1x2x[4]xf32>) -> () + + // CHECK-LABEL: Sum + // CHECK-NEXT: ( 9, 9, 9, 9 + // CHECK-NEXT: ( 9, 9, 9, 9 + vector.print str "\nSum" + %sum = arith.addf %vector_a, %vector_b : vector<1x2x[4]xf32> + func.call @print_1x2xVSCALExf32(%sum) : (vector<1x2x[4]xf32>) -> () + + return +} + +func.func @entry() { + %vscale = vector.vscale + + %c4 = arith.constant 4 : index + %c8 = arith.constant 8 : index + %f32_8 = arith.constant 8.0 : f32 + %f32_5 = arith.constant 5.0 : f32 + %f32_4 = arith.constant 4.0 : f32 + + %test_1_memref_size = arith.muli %vscale, %c8 : index + %test_1_memref = memref.alloca(%test_1_memref_size) : memref<3x?xf32> + + linalg.fill ins(%f32_8 : f32) outs(%test_1_memref :memref<3x?xf32>) + + vector.print str "=> Print and read 2D arrays of scalable vectors:" + func.call @read_and_print_2d_vector(%test_1_memref) : (memref<3x?xf32>) -> () + + vector.print str "\n====================\n" + + %test_2_memref_size = arith.muli %vscale, %c4 : index + %test_2_memref_a = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32> + %test_2_memref_b = memref.alloca(%test_2_memref_size) : memref<1x2x?xf32> + + linalg.fill ins(%f32_5 : f32) outs(%test_2_memref_a :memref<1x2x?xf32>) + linalg.fill ins(%f32_4 : 
f32) outs(%test_2_memref_b :memref<1x2x?xf32>) + + vector.print str "=> Reading and adding two 3D arrays of scalable vectors:" + func.call @add_arrays_of_scalable_vectors( + %test_2_memref_a, %test_2_memref_b) : (memref<1x2x?xf32>, memref<1x2x?xf32>) -> () + + return +} From 16fbc45f48e895cc29b81fb36baa29f010d02881 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 26 Oct 2023 17:04:28 +0530 Subject: [PATCH 048/877] Revert "[AMDGPU] Cleanup hasUnwantedEffectsWhenEXECEmpty function (#70206)" This reverts commit 7ce613fc77af092dd6e9db71ce3747b75bc5616e. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 ++- ...ve-short-exec-branches-special-instructions.mir | 14 ++++++++------ 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7046c37ef6efd..ffcd415a66648 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4010,7 +4010,8 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // // However, executing them with EXEC = 0 causes them to operate on undefined // data, which we avoid by returning true here. - if (Opcode == AMDGPU::V_READFIRSTLANE_B32) + if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || + Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) return true; return false; diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir index 8f3f70ea6287e..fe4aa6a9aea68 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir @@ -130,12 +130,13 @@ body: | --- -name: dont_skip_writelane_b32 +name: need_skip_writelane_b32 body: | - ; CHECK-LABEL: name: dont_skip_writelane_b32 + ; CHECK-LABEL: name: need_skip_writelane_b32 ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -159,12 +160,13 @@ body: | ... --- -name: dont_skip_readlane_b32 +name: need_skip_readlane_b32 body: | - ; CHECK-LABEL: name: dont_skip_readlane_b32 + ; CHECK-LABEL: name: need_skip_readlane_b32 ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000) + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) From 0e110fb429e85e7dc4a1e2de739c0d6e931204a7 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 26 Oct 2023 13:40:25 +0200 Subject: [PATCH 049/877] [libc] memmove optimizations (#70043) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Remove is_disjoint check for smaller sizes and reduce code bloat. inline_memmove may handle some small sizes as efficiently as inline_memcpy. For these sizes we may not do is_disjoint check. This both avoids additional code for the most frequent smaller sizes and removes code bloat (we don't need the memcpy logic for small sizes). 
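A rough, self-contained sketch of the resulting dispatch shape (all names,
the threshold, and the helper bodies below are illustrative placeholders;
the real code composes the shared inline_memmove/inline_memcpy primitives
as described next):

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Buffers are disjoint if one ends at or before the start of the other.
static bool is_disjoint_sketch(const char *dst, const char *src,
                               size_t count) {
  return reinterpret_cast<uintptr_t>(dst) >=
             reinterpret_cast<uintptr_t>(src) + count ||
         reinterpret_cast<uintptr_t>(src) >=
             reinterpret_cast<uintptr_t>(dst) + count;
}

// Overlap-safe copy: pick the direction that never clobbers unread bytes.
static void overlap_safe_copy(char *dst, const char *src, size_t count) {
  if (dst < src)
    for (size_t i = 0; i < count; ++i) dst[i] = src[i];
  else
    for (size_t i = count; i > 0; --i) dst[i - 1] = src[i - 1];
}

void *memmove_sketch(void *dst_v, const void *src_v, size_t count) {
  char *dst = static_cast<char *>(dst_v);
  const char *src = static_cast<const char *>(src_v);
  // (1) Small sizes: handled directly, with no is_disjoint check, since an
  // overlap-safe copy is as cheap as memcpy here.
  if (count <= 64) { // hypothetical small-size threshold
    overlap_safe_copy(dst, src, count);
    return dst_v;
  }
  // Larger sizes: fast memcpy path when disjoint, overlap-safe otherwise.
  if (is_disjoint_sketch(dst, src, count))
    memcpy(dst, src, count);
  else
    overlap_safe_copy(dst, src, count);
  return dst_v;
}
```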
Here we heavily rely on inlining and dead code elimination: from the first
inline_memmove we should get only the handling of small sizes, and from the
second inline_memmove and inline_memcpy we should get only the handling of
larger sizes.

2. Use the memcpy thresholds for memmove. The memcpy thresholds were more
carefully tuned. This becomes more important now that we always use memmove
for all small sizes.

3. Fix boundary conditions for sizes = 16/32/64. See the added comment for
the explanation.

Memmove function size drops from 885 to 715 bytes due to the removed
duplication.

```
│ baseline │ small-size │
│ sec/op │ sec/op vs base │
memmove/Google_A 3.208n ± 0% 2.911n ± 0% -9.25% (n=100)
memmove/Google_B 4.113n ± 1% 3.428n ± 0% -16.65% (n=100)
memmove/Google_D 5.838n ± 0% 4.158n ± 0% -28.78% (n=100)
memmove/Google_S 4.712n ± 1% 3.899n ± 0% -17.25% (n=100)
memmove/Google_U 3.609n ± 0% 3.247n ± 1% -10.02% (n=100)
memmove/0 2.982n ± 0% 2.169n ± 0% -27.26% (n=50)
memmove/1 3.253n ± 0% 2.168n ± 0% -33.34% (n=50)
memmove/2 3.255n ± 0% 2.169n ± 0% -33.38% (n=50)
memmove/3 3.259n ± 2% 2.175n ± 0% -33.27% (p=0.000 n=50)
memmove/4 3.259n ± 0% 2.168n ± 5% -33.46% (p=0.000 n=50)
memmove/5 2.488n ± 0% 1.926n ± 0% -22.57% (p=0.000 n=50)
memmove/6 2.490n ± 0% 1.928n ± 0% -22.59% (p=0.000 n=50)
memmove/7 2.492n ± 0% 1.927n ± 0% -22.65% (p=0.000 n=50)
memmove/8 2.737n ± 0% 2.711n ± 0% -0.97% (p=0.000 n=50)
memmove/9 2.736n ± 0% 2.711n ± 0% -0.94% (p=0.000 n=50)
memmove/10 2.739n ± 0% 2.711n ± 0% -1.04% (p=0.000 n=50)
memmove/11 2.740n ± 0% 2.711n ± 0% -1.07% (p=0.000 n=50)
memmove/12 2.740n ± 0% 2.711n ± 0% -1.09% (p=0.000 n=50)
memmove/13 2.744n ± 0% 2.711n ± 0% -1.22% (p=0.000 n=50)
memmove/14 2.742n ± 0% 2.711n ± 0% -1.14% (p=0.000 n=50)
memmove/15 2.742n ± 0% 2.711n ± 0% -1.15% (p=0.000 n=50)
memmove/16 2.997n ± 0% 2.981n ± 0% -0.52% (p=0.000 n=50)
memmove/17 2.998n ± 0% 2.981n ± 0% -0.55% (p=0.000 n=50)
memmove/18 2.998n ± 0% 2.981n ± 0% -0.55% (p=0.000 n=50)
memmove/19 2.999n ± 0% 2.982n ± 0% -0.59% (p=0.000 n=50)
memmove/20 2.998n ± 0% 2.981n ± 0% -0.55% (p=0.000 n=50)
memmove/21 3.000n ± 0% 2.981n ± 0% -0.61% (p=0.000 n=50)
memmove/22 3.002n ± 0% 2.981n ± 0% -0.68% (p=0.000 n=50)
memmove/23 3.002n ± 0% 2.981n ± 0% -0.67% (p=0.000 n=50)
memmove/24 3.002n ± 0% 2.981n ± 0% -0.70% (n=50)
memmove/25 3.002n ± 0% 2.981n ± 0% -0.68% (p=0.000 n=50)
memmove/26 3.004n ± 0% 2.982n ± 0% -0.74% (p=0.000 n=50)
memmove/27 3.005n ± 0% 2.981n ± 0% -0.79% (n=50)
memmove/28 3.005n ± 0% 2.982n ± 0% -0.77% (n=50)
memmove/29 3.009n ± 0% 2.981n ± 0% -0.92% (n=50)
memmove/30 3.008n ± 0% 2.981n ± 0% -0.89% (n=50)
memmove/31 3.007n ± 0% 2.982n ± 0% -0.86% (n=50)
memmove/32 3.540n ± 0% 2.998n ± 0% -15.31% (p=0.000 n=50)
memmove/33 3.544n ± 0% 2.997n ± 0% -15.44% (p=0.000 n=50)
memmove/34 3.546n ± 0% 2.999n ± 0% -15.42% (n=50)
memmove/35 3.545n ± 0% 2.999n ± 0% -15.40% (n=50)
memmove/36 3.548n ± 0% 2.998n ± 0% -15.52% (p=0.000 n=50)
memmove/37 3.546n ± 0% 3.000n ± 0% -15.41% (n=50)
memmove/38 3.549n ± 0% 2.999n ± 0% -15.49% (p=0.000 n=50)
memmove/39 3.549n ± 0% 2.999n ± 0% -15.48% (p=0.000 n=50)
memmove/40 3.549n ± 0% 3.000n ± 0% -15.46% (p=0.000 n=50)
memmove/41 3.550n ± 0% 3.001n ± 0% -15.47% (n=50)
memmove/42 3.549n ± 0% 3.001n ± 0% -15.43% (n=50)
memmove/43 3.552n ± 0% 3.001n ± 0% -15.52% (p=0.000 n=50)
memmove/44 3.552n ± 0% 3.001n ± 0% -15.51% (n=50)
memmove/45 3.552n ± 0% 3.002n ± 0% -15.48% (n=50)
memmove/46 3.554n ± 0% 3.001n ± 0% -15.55% (p=0.000 n=50)
memmove/47 3.556n ± 0% 3.002n ± 0% -15.58% (p=0.000 n=50)
memmove/48 3.555n ± 0% 3.003n ± 0% -15.54% (n=50)
memmove/49 3.557n ± 0% 3.002n ± 0% -15.59% (p=0.000 n=50)
memmove/50 3.557n ± 0% 3.004n ± 0% -15.55% (p=0.000 n=50)
memmove/51 3.556n ± 0% 3.004n ± 0% -15.53% (p=0.000 n=50)
memmove/52 3.561n ± 0% 3.004n ± 0% -15.65% (p=0.000 n=50)
memmove/53 3.558n ± 0% 3.004n ± 0% -15.57% (p=0.000 n=50)
memmove/54 3.561n ± 0% 3.005n ± 0% -15.62% (n=50)
memmove/55 3.560n ± 0% 3.006n ± 0% -15.57% (n=50)
memmove/56 3.562n ± 0% 3.006n ± 0% -15.60% (p=0.000 n=50)
memmove/57 3.563n ± 0% 3.006n ± 0% -15.64% (n=50)
memmove/58 3.565n ± 0% 3.007n ± 0% -15.64% (p=0.000 n=50)
memmove/59 3.564n ± 0% 3.006n ± 0% -15.66% (p=0.000 n=50)
memmove/60 3.570n ± 0% 3.008n ± 0% -15.74% (p=0.000 n=50)
memmove/61 3.566n ± 0% 3.009n ± 0% -15.63% (p=0.000 n=50)
memmove/62 3.567n ± 0% 3.007n ± 0% -15.70% (p=0.000 n=50)
memmove/63 3.568n ± 0% 3.008n ± 0% -15.71% (p=0.000 n=50)
memmove/64 4.104n ± 0% 3.008n ± 0% -26.70% (p=0.000 n=50)
memmove/65 4.126n ± 0% 3.662n ± 0% -11.26% (p=0.000 n=50)
memmove/66 4.128n ± 0% 3.662n ± 0% -11.29% (n=50)
memmove/67 4.129n ± 0% 3.662n ± 0% -11.31% (n=50)
memmove/68 4.129n ± 0% 3.661n ± 0% -11.33% (p=0.000 n=50)
memmove/69 4.130n ± 0% 3.662n ± 0% -11.34% (p=0.000 n=50)
memmove/70 4.130n ± 0% 3.662n ± 0% -11.33% (n=50)
memmove/71 4.132n ± 0% 3.662n ± 0% -11.38% (p=0.000 n=50)
memmove/72 4.131n ± 0% 3.661n ± 0% -11.39% (n=50)
memmove/73 4.135n ± 0% 3.661n ± 0% -11.45% (p=0.000 n=50)
memmove/74 4.137n ± 0% 3.662n ± 0% -11.49% (n=50)
memmove/75 4.138n ± 0% 3.662n ± 0% -11.51% (p=0.000 n=50)
memmove/76 4.139n ± 0% 3.661n ± 0% -11.56% (p=0.000 n=50)
memmove/77 4.136n ± 0% 3.662n ± 0% -11.47% (p=0.000 n=50)
memmove/78 4.143n ± 0% 3.661n ± 0% -11.62% (p=0.000 n=50)
memmove/79 4.142n ± 0% 3.661n ± 0% -11.60% (n=50)
memmove/80 4.142n ± 0% 3.661n ± 0% -11.62% (p=0.000 n=50)
memmove/81 4.140n ± 0% 3.661n ± 0% -11.57% (n=50)
memmove/82 4.146n ± 0% 3.661n ± 0% -11.69% (n=50)
memmove/83 4.143n ± 0% 3.661n ± 0% -11.63% (p=0.000 n=50)
memmove/84 4.143n ± 0% 3.661n ± 0% -11.63% (n=50)
memmove/85 4.147n ± 0% 3.661n ± 0% -11.73% (p=0.000 n=50)
memmove/86 4.142n ± 0% 3.661n ± 0% -11.62% (p=0.000 n=50)
memmove/87 4.147n ± 0% 3.661n ± 0% -11.72% (p=0.000 n=50)
memmove/88 4.148n ± 0% 3.661n ± 0% -11.74% (n=50)
memmove/89 4.152n ± 0% 3.661n ± 0% -11.84% (n=50)
memmove/90 4.151n ± 0% 3.661n ± 0% -11.81% (n=50)
memmove/91 4.150n ± 0% 3.661n ± 0% -11.78% (n=50)
memmove/92 4.153n ± 0% 3.661n ± 0% -11.86% (n=50)
memmove/93 4.158n ± 0% 3.661n ± 0% -11.95% (n=50)
memmove/94 4.157n ± 0% 3.661n ± 0% -11.95% (p=0.000 n=50)
memmove/95 4.155n ± 0% 3.661n ± 0% -11.90% (p=0.000 n=50)
memmove/96 4.149n ± 0% 3.660n ± 0% -11.79% (n=50)
memmove/97 4.157n ± 0% 3.661n ± 0% -11.94% (n=50)
memmove/98 4.157n ± 0% 3.661n ± 0% -11.94% (n=50)
memmove/99 4.168n ± 0% 3.661n ± 0% -12.17% (p=0.000 n=50)
memmove/100 4.159n ± 0% 3.660n ± 0% -12.00% (p=0.000 n=50)
memmove/101 4.161n ± 0% 3.660n ± 0% -12.03% (p=0.000 n=50)
memmove/102 4.165n ± 0% 3.660n ± 0% -12.12% (p=0.000 n=50)
memmove/103 4.164n ± 0% 3.661n ± 0% -12.08% (n=50)
memmove/104 4.164n ± 0% 3.660n ± 0% -12.11% (n=50)
memmove/105 4.165n ± 0% 3.660n ± 0% -12.12% (p=0.000 n=50)
memmove/106 4.166n ± 0% 3.660n ± 0% -12.15% (n=50)
memmove/107 4.171n ± 0% 3.660n ± 1% -12.26% (p=0.000 n=50)
memmove/108 4.173n ± 0% 3.660n ± 0% -12.30% (p=0.000 n=50)
memmove/109 4.170n ± 0% 3.660n ± 0% -12.24% (n=50)
memmove/110 4.174n ± 0% 3.660n ± 0% -12.31% (n=50)
memmove/111 4.176n ± 0% 3.660n ± 0% -12.35% (p=0.000 n=50)
memmove/112 4.174n ± 0% 3.659n ± 0% -12.34% (p=0.000 n=50)
memmove/113 4.176n ± 0% 3.660n ± 0% -12.35% (n=50)
memmove/114 4.182n ± 0% 3.660n ± 0% -12.49% (n=50)
memmove/115 4.185n ± 0% 3.660n ± 0% -12.55% (n=50)
memmove/116 4.184n ± 0% 3.659n ± 0% -12.54% (n=50)
memmove/117 4.182n ± 0% 3.660n ± 0% -12.50% (n=50)
memmove/118 4.188n ± 0% 3.660n ± 0% -12.61% (n=50)
memmove/119 4.186n ± 0% 3.660n ± 0% -12.57% (p=0.000 n=50)
memmove/120 4.189n ± 0% 3.659n ± 0% -12.63% (n=50)
memmove/121 4.187n ± 0% 3.660n ± 0% -12.60% (n=50)
memmove/122 4.186n ± 0% 3.660n ± 0% -12.58% (n=50)
memmove/123 4.187n ± 0% 3.660n ± 0% -12.60% (n=50)
memmove/124 4.189n ± 0% 3.659n ± 0% -12.65% (n=50)
memmove/125 4.195n ± 0% 3.659n ± 0% -12.78% (n=50)
memmove/126 4.197n ± 0% 3.659n ± 0% -12.81% (n=50)
memmove/127 4.194n ± 0% 3.659n ± 0% -12.75% (n=50)
memmove/128 5.035n ± 0% 3.659n ± 0% -27.32% (n=50)
memmove/129 5.127n ± 0% 5.164n ± 0% +0.73% (p=0.000 n=50)
memmove/130 5.130n ± 0% 5.176n ± 0% +0.88% (p=0.000 n=50)
memmove/131 5.127n ± 0% 5.180n ± 0% +1.05% (p=0.000 n=50)
memmove/132 5.131n ± 0% 5.169n ± 0% +0.75% (p=0.000 n=50)
memmove/133 5.137n ± 0% 5.179n ± 0% +0.81% (p=0.000 n=50)
memmove/134 5.140n ± 0% 5.178n ± 0% +0.74% (p=0.000 n=50)
memmove/135 5.141n ± 0% 5.187n ± 0% +0.88% (p=0.000 n=50)
memmove/136 5.133n ± 0% 5.184n ± 0% +0.99% (p=0.000 n=50)
memmove/137 5.148n ± 0% 5.186n ± 0% +0.73% (p=0.000 n=50)
memmove/138 5.143n ± 0% 5.189n ± 0% +0.88% (p=0.000 n=50)
memmove/139 5.142n ± 0% 5.192n ± 0% +0.97% (p=0.000 n=50)
memmove/140 5.141n ± 0% 5.192n ± 0% +1.01% (p=0.000 n=50)
memmove/141 5.155n ± 0% 5.188n ± 0% +0.64% (p=0.000 n=50)
memmove/142 5.146n ± 0% 5.192n ± 0% +0.90% (p=0.000 n=50)
memmove/143 5.142n ± 0% 5.203n ± 0% +1.19% (p=0.000 n=50)
memmove/144 5.146n ± 0% 5.197n ± 0% +0.99% (p=0.000 n=50)
memmove/145 5.146n ± 0% 5.196n ± 0% +0.97% (p=0.000 n=50)
memmove/146 5.151n ± 0% 5.207n ± 0% +1.10% (p=0.000 n=50)
memmove/147 5.151n ± 0% 5.205n ± 0% +1.06% (p=0.000 n=50)
memmove/148 5.156n ± 0% 5.190n ± 0% +0.66% (p=0.000 n=50)
memmove/149 5.158n ± 0% 5.212n ± 0% +1.04% (p=0.000 n=50)
memmove/150 5.160n ± 0% 5.203n ± 0% +0.84% (p=0.000 n=50)
memmove/151 5.167n ± 0% 5.210n ± 0% +0.83% (p=0.000 n=50)
memmove/152 5.157n ± 0% 5.206n ± 0% +0.94% (p=0.000 n=50)
memmove/153 5.170n ± 0% 5.211n ± 0% +0.80% (p=0.000 n=50)
memmove/154 5.169n ± 0% 5.222n ± 0% +1.02% (p=0.000 n=50)
memmove/155 5.171n ± 0% 5.215n ± 0% +0.87% (p=0.000 n=50)
memmove/156 5.174n ± 0% 5.214n ± 0% +0.78% (p=0.000 n=50)
memmove/157 5.171n ± 0% 5.218n ± 0% +0.92% (p=0.000 n=50)
memmove/158 5.168n ± 0% 5.224n ± 0% +1.09% (p=0.000 n=50)
memmove/159 5.179n ± 0% 5.218n ± 0% +0.76% (p=0.000 n=50)
memmove/160 5.170n ± 0% 5.219n ± 0% +0.95% (p=0.000 n=50)
memmove/161 5.187n ± 0% 5.220n ± 0% +0.64% (p=0.000 n=50)
memmove/162 5.189n ± 0% 5.234n ± 0% +0.86% (p=0.000 n=50)
memmove/163 5.199n ± 0% 5.250n ± 0% +0.99% (p=0.000 n=50)
memmove/164 5.205n ± 0% 5.260n ± 0% +1.04% (p=0.000 n=50)
memmove/165 5.208n ± 0% 5.261n ± 0% +1.01% (p=0.000 n=50)
memmove/166 5.227n ± 0% 5.275n ± 0% +0.91% (p=0.000 n=50)
memmove/167 5.233n ± 0% 5.281n ± 0% +0.92% (p=0.000 n=50)
memmove/168 5.236n ± 0% 5.295n ± 0% +1.12% (p=0.000 n=50)
memmove/169 5.256n ± 0% 5.297n ± 0% +0.79% (p=0.000 n=50)
memmove/170 5.259n ± 0% 5.302n ± 0% +0.80% (p=0.000 n=50)
memmove/171 5.269n ± 0% 5.321n ± 0% +0.97% (p=0.000 n=50)
memmove/172 5.266n ± 0% 5.318n ± 0% +0.98% (p=0.000 n=50)
memmove/173 5.272n ± 0% 5.330n ± 0% +1.09% (p=0.000 n=50)
memmove/174 5.284n ± 0% 5.331n ± 0% +0.89% (p=0.000 n=50)
memmove/175 5.284n ± 0% 5.322n ± 0% +0.72% (p=0.000 n=50)
memmove/176 5.298n ± 0% 5.337n ± 0% +0.74% (p=0.000 n=50)
memmove/177 5.282n ± 0% 5.338n ± 0% +1.04% (p=0.000 n=50)
memmove/178 5.299n ± 0% 5.337n ± 0% +0.71% (p=0.000 n=50)
memmove/179 5.296n ± 0% 5.343n ± 0% +0.88% (p=0.000 n=50)
memmove/180 5.292n ± 0% 5.343n ± 0% +0.97% (p=0.000 n=50)
memmove/181 5.303n ± 0% 5.335n ± 0% +0.60% (p=0.000 n=50)
memmove/182 5.305n ± 0% 5.338n ± 0% +0.62% (p=0.000 n=50)
memmove/183 5.298n ± 0% 5.329n ± 0% +0.59% (p=0.000 n=50)
memmove/184 5.299n ± 0% 5.333n ± 0% +0.64% (p=0.000 n=50)
memmove/185 5.291n ± 0% 5.330n ± 0% +0.73% (p=0.000 n=50)
memmove/186 5.296n ± 0% 5.332n ± 0% +0.68% (p=0.000 n=50)
memmove/187 5.297n ± 0% 5.320n ± 0% +0.44% (p=0.000 n=50)
memmove/188 5.286n ± 0% 5.314n ± 0% +0.53% (p=0.000 n=50)
memmove/189 5.293n ± 0% 5.318n ± 0% +0.46% (p=0.000 n=50)
memmove/190 5.294n ± 0% 5.318n ± 0% +0.45% (p=0.000 n=50)
memmove/191 5.292n ± 0% 5.314n ± 0% +0.40% (p=0.032 n=50)
memmove/192 5.272n ± 0% 5.304n ± 0% +0.60% (p=0.000 n=50)
memmove/193 5.279n ± 0% 5.310n ± 0% +0.57% (p=0.000 n=50)
memmove/194 5.294n ± 0% 5.308n ± 0% +0.26% (p=0.018 n=50)
memmove/195 5.302n ± 0% 5.311n ± 0% +0.18% (p=0.010 n=50)
memmove/196 5.301n ± 0% 5.316n ± 0% +0.28% (p=0.023 n=50)
memmove/197 5.302n ± 0% 5.327n ± 0% +0.47% (p=0.000 n=50)
memmove/198 5.310n ± 0% 5.326n ± 0% +0.30% (p=0.003 n=50)
memmove/199 5.303n ± 0% 5.319n ± 0% +0.30% (p=0.009 n=50)
memmove/200 5.312n ± 0% 5.330n ± 0% +0.35% (p=0.001 n=50)
memmove/201 5.307n ± 0% 5.333n ± 0% +0.50% (p=0.000 n=50)
memmove/202 5.311n ± 0% 5.334n ± 0% +0.44% (p=0.000 n=50)
memmove/203 5.313n ± 0% 5.335n ± 0% +0.41% (p=0.006 n=50)
memmove/204 5.312n ± 0% 5.332n ± 0% +0.36% (p=0.002 n=50)
memmove/205 5.318n ± 0% 5.345n ± 0% +0.50% (p=0.000 n=50)
memmove/206 5.311n ± 0% 5.333n ± 0% +0.42% (p=0.002 n=50)
memmove/207 5.310n ± 0% 5.338n ± 0% +0.52% (p=0.000 n=50)
memmove/208 5.319n ± 0% 5.341n ± 0% +0.40% (p=0.004 n=50)
memmove/209 5.330n ± 0% 5.346n ± 0% +0.30% (p=0.004 n=50)
memmove/210 5.329n ± 0% 5.349n ± 0% +0.38% (p=0.002 n=50)
memmove/211 5.318n ± 0% 5.340n ± 0% +0.41% (p=0.000 n=50)
memmove/212 5.339n ± 0% 5.343n ± 0% ~ (p=0.396 n=50)
memmove/213 5.329n ± 0% 5.343n ± 0% +0.25% (p=0.017 n=50)
memmove/214 5.339n ± 0% 5.358n ± 0% +0.35% (p=0.035 n=50)
memmove/215 5.342n ± 0% 5.346n ± 0% ~ (p=0.063 n=50)
memmove/216 5.338n ± 0% 5.359n ± 0% +0.39% (p=0.002 n=50)
memmove/217 5.341n ± 0% 5.362n ± 0% +0.39% (p=0.015 n=50)
memmove/218 5.354n ± 0% 5.373n ± 0% +0.36% (p=0.041 n=50)
memmove/219 5.352n ± 0% 5.362n ± 0% ~ (p=0.143 n=50)
memmove/220 5.344n ± 0% 5.370n ± 0% +0.50% (p=0.001 n=50)
memmove/221 5.345n ± 0% 5.373n ± 0% +0.53% (p=0.000 n=50)
memmove/222 5.348n ± 0% 5.360n ± 0% +0.23% (p=0.014 n=50)
memmove/223 5.354n ± 0% 5.377n ± 0% +0.43% (p=0.024 n=50)
memmove/224 5.352n ± 0% 5.363n ± 0% ~ (p=0.052 n=50)
memmove/225 5.372n ± 0% 5.380n ± 0% ~ (p=0.481 n=50)
memmove/226 5.368n ± 0% 5.386n ± 0% +0.34% (p=0.004 n=50)
memmove/227 5.386n ± 0% 5.402n ± 0% +0.29% (p=0.028 n=50)
memmove/228 5.400n ± 0% 5.408n ± 0% ~ (p=0.174 n=50)
memmove/229 5.423n ± 0% 5.427n ± 0% ~ (p=0.444 n=50)
memmove/230 5.411n ± 0% 5.429n ± 0% +0.33% (p=0.020 n=50)
memmove/231 5.420n ± 0% 5.433n ± 0% +0.24% (p=0.034 n=50)
memmove/232 5.435n ± 0% 5.441n ± 0% ~ (p=0.235 n=50)
memmove/233 5.446n ± 0% 5.462n ± 0% ~ (p=0.590 n=50)
memmove/234 5.467n ± 0% 5.461n ± 0% ~ (p=0.921 n=50)
memmove/235 5.472n ± 0% 5.478n ± 0% ~ (p=0.883 n=50)
memmove/236 5.466n ± 0% 5.478n ± 0% ~ (p=0.324 n=50)
memmove/237 5.471n ± 0% 5.489n ± 0% ~ (p=0.132 n=50)
memmove/238 5.485n ± 0% 5.489n ± 0% ~ (p=0.460 n=50)
memmove/239 5.484n ± 0% 5.488n ± 0% ~ (p=0.833 n=50)
memmove/240 5.483n ± 0% 5.495n ± 0% ~ (p=0.095 n=50)
memmove/241 5.498n ± 0% 5.514n ± 0% ~ (p=0.077 n=50)
memmove/242 5.518n ± 0% 5.517n ± 0% ~ (p=0.481 n=50)
memmove/243 5.514n ± 0% 5.511n ± 0% ~ (p=0.503 n=50)
memmove/244 5.510n ± 0% 5.497n ± 0% -0.24% (p=0.038 n=50)
memmove/245 5.516n ± 0% 5.505n ± 0% ~ (p=0.317 n=50)
memmove/246 5.513n ± 1% 5.494n ± 0% ~ (p=0.147 n=50)
memmove/247 5.518n ± 0% 5.499n ± 0% -0.36% (p=0.011 n=50)
memmove/248 5.503n ± 0% 5.492n ± 0% ~ (p=0.267 n=50)
memmove/249 5.498n ± 0% 5.497n ± 0% ~ (p=0.765 n=50)
memmove/250 5.485n ± 0% 5.493n ± 0% ~ (p=0.348 n=50)
memmove/251 5.503n ± 0% 5.482n ± 0% -0.37% (p=0.013 n=50)
memmove/252 5.497n ± 0% 5.485n ± 0% ~ (p=0.077 n=50)
memmove/253 5.489n ± 0% 5.496n ± 0% ~ (p=0.850 n=50)
memmove/254 5.497n ± 0% 5.491n ± 0% ~ (p=0.548 n=50)
memmove/255 5.484n ± 1% 5.494n ± 0% ~ (p=0.888 n=50)
memmove/256 6.952n ± 0% 7.676n ± 0% +10.41% (p=0.000 n=50)
geomean 4.406n 4.127n -6.33%
```
---
 libc/src/string/memmove.cpp                   |  8 +-
 libc/src/string/memory_utils/inline_memmove.h | 44 ++++++++--
 .../memory_utils/x86_64/inline_memmove.h      | 86 +++++++++++++++----
 3 files changed, 114 insertions(+), 24 deletions(-)

diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
index 7d473afc0b42e..19e38a3c8bdbe 100644
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -15,10 +15,16 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(void *, memmove,
                    (void *dst, const void *src, size_t count)) {
+  // Memmove may handle some small sizes as efficiently as inline_memcpy.
+  // For these sizes we do not need the is_disjoint check.
+  // This both avoids additional code for the most frequent smaller sizes
+  // and removes code bloat (we don't need the memcpy logic for small sizes).
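+  // (On x86-64, inline_memmove_small_size currently returns true for all
+  // counts up to 128; targets without a specialized small-size path use
+  // inline_memmove_no_small_size, which always returns false, so every
+  // size falls through to the dispatch below.)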
+ if (inline_memmove_small_size(dst, src, count)) + return dst; if (is_disjoint(dst, src, count)) inline_memcpy(dst, src, count); else - inline_memmove(dst, src, count); + inline_memmove_follow_up(dst, src, count); return dst; } diff --git a/libc/src/string/memory_utils/inline_memmove.h b/libc/src/string/memory_utils/inline_memmove.h index f72ea24ab538d..30c2c3ddbf1bb 100644 --- a/libc/src/string/memory_utils/inline_memmove.h +++ b/libc/src/string/memory_utils/inline_memmove.h @@ -13,28 +13,58 @@ #if defined(LIBC_TARGET_ARCH_IS_X86) #include "src/string/memory_utils/x86_64/inline_memmove.h" -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_x86 +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE \ + inline_memmove_small_size_x86 +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP \ + inline_memmove_follow_up_x86 #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) #include "src/string/memory_utils/aarch64/inline_memmove.h" -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_aarch64 +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE \ + inline_memmove_no_small_size +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_aarch64 #elif defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) #include "src/string/memory_utils/riscv/inline_memmove.h" -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_riscv +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE \ + inline_memmove_no_small_size +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_riscv #elif defined(LIBC_TARGET_ARCH_IS_ARM) #include "src/string/memory_utils/generic/byte_per_byte.h" -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_byte_per_byte +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE \ + inline_memmove_no_small_size +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP \ + inline_memmove_byte_per_byte #elif defined(LIBC_TARGET_ARCH_IS_GPU) #include "src/string/memory_utils/generic/builtin.h" -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE inline_memmove_builtin +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE \ + inline_memmove_no_small_size +#define LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP inline_memmove_builtin #else #error "Unsupported architecture" #endif namespace LIBC_NAMESPACE { +LIBC_INLINE constexpr bool inline_memmove_no_small_size(void *, const void *, + size_t) { + return false; +} + +LIBC_INLINE bool inline_memmove_small_size(void *dst, const void *src, + size_t count) { + return LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_SMALL_SIZE( + reinterpret_cast(dst), reinterpret_cast(src), count); +} + +LIBC_INLINE void inline_memmove_follow_up(void *dst, const void *src, + size_t count) { + LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE_FOLLOW_UP( + reinterpret_cast(dst), reinterpret_cast(src), count); +} + LIBC_INLINE void inline_memmove(void *dst, const void *src, size_t count) { - LIBC_SRC_STRING_MEMORY_UTILS_MEMMOVE(reinterpret_cast(dst), - reinterpret_cast(src), count); + if (inline_memmove_small_size(dst, src, count)) + return; + inline_memmove_follow_up(dst, src, count); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/string/memory_utils/x86_64/inline_memmove.h b/libc/src/string/memory_utils/x86_64/inline_memmove.h index 95ad07f752195..879b36eaa6734 100644 --- a/libc/src/string/memory_utils/x86_64/inline_memmove.h +++ b/libc/src/string/memory_utils/x86_64/inline_memmove.h @@ -18,40 +18,94 @@ namespace LIBC_NAMESPACE { -LIBC_INLINE void inline_memmove_x86(Ptr dst, CPtr src, size_t count) { +LIBC_INLINE bool 
inline_memmove_small_size_x86(Ptr dst, CPtr src, + size_t count) { #if defined(__AVX512F__) + constexpr size_t vector_size = 64; using uint128_t = generic_v128; using uint256_t = generic_v256; using uint512_t = generic_v512; #elif defined(__AVX__) + constexpr size_t vector_size = 32; using uint128_t = generic_v128; using uint256_t = generic_v256; using uint512_t = cpp::array; #elif defined(__SSE2__) + constexpr size_t vector_size = 16; using uint128_t = generic_v128; using uint256_t = cpp::array; using uint512_t = cpp::array; #else + constexpr size_t vector_size = 8; using uint128_t = cpp::array; using uint256_t = cpp::array; using uint512_t = cpp::array; #endif + (void)vector_size; if (count == 0) - return; - if (count == 1) - return generic::Memmove::block(dst, src); - if (count <= 4) - return generic::Memmove::head_tail(dst, src, count); - if (count <= 8) - return generic::Memmove::head_tail(dst, src, count); - if (count <= 16) - return generic::Memmove::head_tail(dst, src, count); - if (count <= 32) - return generic::Memmove::head_tail(dst, src, count); - if (count <= 64) - return generic::Memmove::head_tail(dst, src, count); - if (count <= 128) - return generic::Memmove::head_tail(dst, src, count); + return true; + if (count == 1) { + generic::Memmove::block(dst, src); + return true; + } + if (count == 2) { + generic::Memmove::block(dst, src); + return true; + } + if (count == 3) { + generic::Memmove>::block(dst, src); + return true; + } + if (count == 4) { + generic::Memmove::block(dst, src); + return true; + } + if (count < 8) { + generic::Memmove::head_tail(dst, src, count); + return true; + } + // If count is equal to a power of 2, we can handle it as head-tail + // of both smaller size and larger size (head-tail are either + // non-overlapping for smaller size, or completely collapsed + // for larger size). It seems to be more profitable to do the copy + // with the larger size, if it's natively supported (e.g. doing + // 2 collapsed 32-byte moves for count=64 if AVX2 is supported). + // But it's not profitable to use larger size if it's not natively + // supported: we will both use more instructions and handle fewer + // sizes in earlier branches. + if (vector_size >= 16 ? count < 16 : count <= 16) { + generic::Memmove::head_tail(dst, src, count); + return true; + } + if (vector_size >= 32 ? count < 32 : count <= 32) { + generic::Memmove::head_tail(dst, src, count); + return true; + } + if (vector_size >= 64 ? count < 64 : count <= 64) { + generic::Memmove::head_tail(dst, src, count); + return true; + } + if (count <= 128) { + generic::Memmove::head_tail(dst, src, count); + return true; + } + return false; +} + +LIBC_INLINE void inline_memmove_follow_up_x86(Ptr dst, CPtr src, size_t count) { +#if defined(__AVX512F__) + using uint256_t = generic_v256; + using uint512_t = generic_v512; +#elif defined(__AVX__) + using uint256_t = generic_v256; + using uint512_t = cpp::array; +#elif defined(__SSE2__) + using uint256_t = cpp::array; + using uint512_t = cpp::array; +#else + using uint256_t = cpp::array; + using uint512_t = cpp::array; +#endif if (dst < src) { generic::Memmove::align_forward(dst, src, count); return generic::Memmove::loop_and_tail_forward(dst, src, count); From 9db8f99b61e4c5bab45a41a4da2ab169954d8ff1 Mon Sep 17 00:00:00 2001 From: Johannes Reifferscheid Date: Thu, 26 Oct 2023 13:41:14 +0200 Subject: [PATCH 050/877] [Bazel] Fixes for 96e040a. 
--- .../llvm-project-overlay/mlir/BUILD.bazel | 22 ++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index abe613b45c80a..2b6a82111d879 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2103,18 +2103,38 @@ cc_library( ], ) +gentbl_cc_library( + name = "ArmSVEPassIncGen", + tbl_outs = [ + ( + [ + "-gen-pass-decls", + "-name=ArmSVE", + ], + "include/mlir/Dialect/ArmSVE/Transforms/Passes.h.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/ArmSVE/Transforms/Passes.td", + deps = [":PassBaseTdFiles"], +) + cc_library( name = "ArmSVETransforms", srcs = glob(["lib/Dialect/ArmSVE/Transforms/*.cpp"]), - hdrs = ["include/mlir/Dialect/ArmSVE/Transforms/Transforms.h"], + hdrs = glob(["include/mlir/Dialect/ArmSVE/Transforms/*.h"]), includes = ["include"], deps = [ ":ArmSVEDialect", + ":ArmSVEPassIncGen", ":DialectUtils", ":FuncDialect", ":IR", ":LLVMCommonConversion", ":LLVMDialect", + ":MemRefDialect", + ":Pass", + ":TransformUtils", ":VectorDialect", ], ) From 8a80e331506e3e3db390ed0b482c7cbe216f7afc Mon Sep 17 00:00:00 2001 From: bjacob Date: Thu, 26 Oct 2023 07:47:00 -0400 Subject: [PATCH 051/877] Add `isBatchVecmat` utilities for `linalg.batch_vecmat` (#70284) `linalg.batch_vecmat` was just added in https://github.com/llvm/llvm-project/pull/70218, but I forgot then to add the standard `isBatchVecmat` utilities --- .../Dialect/Linalg/IR/LinalgInterfaces.td | 11 ++++ .../mlir/Dialect/Utils/StructuredOpsUtils.h | 6 +++ mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp | 25 +++++++++ .../Dialect/Utils/StructuredOpsUtilsTest.cpp | 52 +++++++++++++++++++ 4 files changed, 94 insertions(+) diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td index 44e82f452b3ce..69ca888a8acdb 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td @@ -98,6 +98,17 @@ def LinalgContractionOpInterface : OpInterface<"ContractionOpInterface"> { return mlir::isVecmat($_op.getIndexingMaps()); }]>, InterfaceMethod< + /*desc=*/[{ + Returns whether the given op has indexing maps that correspond to a + batched vector-matrix multiplication. + }], + /*retTy=*/"bool", + /*methodName=*/"isBatchVecmat", + /*args=*/(ins), + /*methodBody=*/[{ + return mlir::isBatchVecmat($_op.getIndexingMaps()); + }]>, + InterfaceMethod< /*desc=*/[{ Returns whether the given op has indexing maps that correspond to a matrix-vector multiplication. diff --git a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h index 225b9f287d340..134c5569fbb2f 100644 --- a/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/StructuredOpsUtils.h @@ -55,6 +55,12 @@ bool isRowMajorBatchMatmul(ArrayAttr indexingMaps); /// performed within the reduction. bool isVecmat(ArrayAttr indexingMaps); +/// Tests whether the given maps describe a batch vector matrix multiplication. +/// The test is permutation-invariant. Note that this only checks the affine +/// maps from an operation, so does not perform any checks on the math being +/// performed within the reduction. +bool isBatchVecmat(ArrayAttr indexingMaps); + /// Tests whether the given maps describe a matrix vector multiplication. 
The /// test is permutation-invariant. Note that this only checks the affine maps /// from an operation, so does not perform any checks on the math being diff --git a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp index 641ddf3f91cb2..383ef1cea53fd 100644 --- a/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/StructuredOpsUtils.cpp @@ -120,6 +120,31 @@ bool mlir::isVecmat(ArrayAttr indexingMaps) { return indexingMaps == maps; } +bool mlir::isBatchVecmat(ArrayAttr indexingMaps) { + if (indexingMaps.size() != 3) + return false; + AffineMap map0 = cast(indexingMaps[0]).getValue(); + AffineMap map1 = cast(indexingMaps[1]).getValue(); + AffineMap map2 = cast(indexingMaps[2]).getValue(); + + if (map0.getNumResults() != 2 || map1.getNumResults() != 3 || + map2.getNumResults() != 2 || map0.getNumInputs() != 3 || + map1.getNumInputs() != 3 || map2.getNumInputs() != 3) { + return false; + } + + // Extract dimensions for B*K * B*K*N -> B*N + AffineExpr b = map0.getResult(0); + AffineExpr k = map0.getResult(1); + AffineExpr n = map2.getResult(1); + auto *context = indexingMaps.getContext(); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {b, k}, context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {b, k, n}, context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {b, n}, context)); + auto maps = ArrayAttr::get(context, {mapA, mapB, mapC}); + return indexingMaps == maps; +} + bool mlir::isMatvec(ArrayAttr indexingMaps) { if (indexingMaps.size() != 3) return false; diff --git a/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp index 3f576bacebf6a..d257fc5d6e041 100644 --- a/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp +++ b/mlir/unittests/Dialect/Utils/StructuredOpsUtilsTest.cpp @@ -370,4 +370,56 @@ TEST(isBatchMatvec, WrongDimOrderMatrix) { EXPECT_THAT(maps, Not(Truly(isBatchMatvec))); } +TEST(isBatchVecmat, Simple) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, k, n); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isBatchVecmat)); +} + +TEST(isBatchVecmat, BindingSwapped) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, n, k); // bind in different order + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Truly(isBatchVecmat)); +} + +TEST(isBatchVecmat, Matmul) { + MLIRContext context; + + AffineExpr m, n, k; + bindDims(&context, m, n, k); + auto mapA = AffineMapAttr::get(AffineMap::get(3, 0, {m, k}, &context)); + auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {k, n}, &context)); + auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {m, n}, &context)); + auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC}); + + EXPECT_THAT(maps, Not(Truly(isBatchVecmat))); +} + +TEST(isBatchVecmat, WrongDimOrderMatrix) { + MLIRContext context; + + AffineExpr batch, k, n; + bindDims(&context, batch, k, n); + auto mapA = 
      AffineMapAttr::get(AffineMap::get(3, 0, {batch, k}, &context));
+  auto mapB = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n, k}, &context));
+  auto mapC = AffineMapAttr::get(AffineMap::get(3, 0, {batch, n}, &context));
+  auto maps = ArrayAttr::get(&context, {mapA, mapB, mapC});
+
+  EXPECT_THAT(maps, Not(Truly(isBatchVecmat)));
+}
+
 } // namespace

From 8149066fa532d82ff62a0629d5a9fab6bd4da768 Mon Sep 17 00:00:00 2001
From: Weining Lu
Date: Thu, 26 Oct 2023 11:50:28 +0800
Subject: [PATCH 052/877] [LoongArch][test] Add some ABI regression tests for empty struct. NFC

How empty structs (not as fields of container struct) are passed in C++
is not explicitly documented in psABI. This patch adds some tests showing
clang's current handling. Some of the results are different from gcc.
Following patch(es) will try to fix the mismatch.

---
 .../LoongArch/abi-lp64d-empty-structs.c       | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
index fb90bf556c19b..d0daafac336ec 100644
--- a/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
+++ b/clang/test/CodeGen/LoongArch/abi-lp64d-empty-structs.c
@@ -81,9 +81,62 @@ struct s8 test_s8(struct s8 a) {
   return a;
 }
 
+/// Note: The tests below check how empty structs are passed on their own,
+/// while the tests above check that empty structs as fields of a container
+/// struct are ignored when flattening structs to examine whether the
+/// container structs can be passed via FARs.
+
 // CHECK-C: define{{.*}} void @test_s9()
 // CHECK-CXX: define{{.*}} i64 @_Z7test_s92s9(i64 {{.*}})
 struct s9 { struct empty e; };
 struct s9 test_s9(struct s9 a) {
   return a;
 }
+
+// CHECK-C: define{{.*}} void @test_s10()
+// CHECK-CXX: define{{.*}} void @_Z8test_s103s10()
+struct s10 { };
+struct s10 test_s10(struct s10 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s11()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s113s11(i64 {{.*}})
+struct s11 { struct { } s; };
+struct s11 test_s11(struct s11 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s12()
+// CHECK-CXX: define{{.*}} void @_Z8test_s123s12()
+struct s12 { int i[0]; };
+struct s12 test_s12(struct s12 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s13()
+// CHECK-CXX: define{{.*}} void @_Z8test_s133s13()
+struct s13 { struct { } s[0]; };
+struct s13 test_s13(struct s13 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s14()
+// CHECK-CXX: define{{.*}} i64 @_Z8test_s143s14(i64 {{.*}})
+struct s14 { struct { } s[1]; };
+struct s14 test_s14(struct s14 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s15()
+// CHECK-CXX: define{{.*}} void @_Z8test_s153s15()
+struct s15 { int : 0; };
+struct s15 test_s15(struct s15 a) {
+  return a;
+}
+
+// CHECK-C: define{{.*}} void @test_s16()
+// CHECK-CXX: define{{.*}} void @_Z8test_s163s16()
+struct s16 { int : 1; };
+struct s16 test_s16(struct s16 a) {
+  return a;
+}

From 658874e084e594f9881d5f1d004fba54d0a44d85 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Thu, 26 Oct 2023 13:53:54 +0200
Subject: [PATCH 053/877] [clang][Interp] Handle unknown-size arrays better (#68868)

Unfortunately, it seems we do need some checks for array-to-pointer
decays after all.
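For illustration, a minimal case of the pattern this has to handle
(distilled from the tests added to clang/test/AST/Interp/arrays.cpp
below): decay of an array member without known bound must now be rejected
in a constant expression.

```
struct Foo {
  char c;
  int a[]; // flexible array member, no known bound
};
constexpr Foo F{};
constexpr const int *A = F.a; // must be diagnosed: array-to-pointer decay
                              // of array member without known bound
```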
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 14 +++++++-
 clang/lib/AST/Interp/EvalEmitter.cpp     |  6 ++++
 clang/lib/AST/Interp/Interp.cpp          |  2 ++
 clang/lib/AST/Interp/Interp.h            | 20 +++++++++++
 clang/lib/AST/Interp/Opcodes.td          |  2 ++
 clang/test/AST/Interp/arrays.cpp         | 42 ++++++++++++++++++++++++
 6 files changed, 85 insertions(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 1b33c69b93aa4..1e508f8998abe 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -168,7 +168,16 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) {
     return this->emitCastPointerIntegral(T, CE);
   }
 
-  case CK_ArrayToPointerDecay:
+  case CK_ArrayToPointerDecay: {
+    if (!this->visit(SubExpr))
+      return false;
+    if (!this->emitArrayDecay(CE))
+      return false;
+    if (DiscardResult)
+      return this->emitPopPtr(CE);
+    return true;
+  }
+
   case CK_AtomicToNonAtomic:
   case CK_ConstructorConversion:
   case CK_FunctionToPointerDecay:
@@ -505,6 +514,9 @@ bool ByteCodeExprGen::VisitImplicitValueInitExpr(const ImplicitValueIni
   if (QT->isRecordType())
     return false;
 
+  if (QT->isIncompleteArrayType())
+    return true;
+
   if (QT->isArrayType()) {
     const ArrayType *AT = QT->getAsArrayTypeUnsafe();
     assert(AT);
diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp
index f8942291b3b16..9bc42057c5f57 100644
--- a/clang/lib/AST/Interp/EvalEmitter.cpp
+++ b/clang/lib/AST/Interp/EvalEmitter.cpp
@@ -185,6 +185,12 @@ bool EvalEmitter::emitRetValue(const SourceInfo &Info) {
     }
     return Ok;
   }
+
+  if (Ty->isIncompleteArrayType()) {
+    R = APValue(APValue::UninitArray(), 0, 0);
+    return true;
+  }
+
   if (const auto *AT = Ty->getAsArrayTypeUnsafe()) {
     const size_t NumElems = Ptr.getNumElems();
     QualType ElemTy = AT->getElementType();
diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index c87bb2fa6b02f..4a4c0922758c9 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -483,6 +483,8 @@ static bool CheckFieldsInitialized(InterpState &S, CodePtr OpPC,
     if (FieldType->isRecordType()) {
       Result &= CheckFieldsInitialized(S, OpPC, FieldPtr, FieldPtr.getRecord());
+    } else if (FieldType->isIncompleteArrayType()) {
+      // Nothing to do here.
     } else if (FieldType->isArrayType()) {
       const auto *CAT = cast(FieldType->getAsArrayTypeUnsafe());
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 2132e8b0a8cfa..86cc267652951 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1802,17 +1802,37 @@ inline bool ArrayElemPtr(InterpState &S, CodePtr OpPC) {
   const T &Offset = S.Stk.pop();
   const Pointer &Ptr = S.Stk.peek();
 
+  if (!CheckArray(S, OpPC, Ptr))
+    return false;
+
   if (!OffsetHelper(S, OpPC, Offset, Ptr))
     return false;
 
   return NarrowPtr(S, OpPC);
 }

+/// Just takes a pointer and checks if it's an incomplete
+/// array type.
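+/// Returns false, after emitting note_constexpr_unsupported_unsized_array,
+/// if the pointee is an array of unknown bound; otherwise it does nothing.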
+inline bool ArrayDecay(InterpState &S, CodePtr OpPC) { + const Pointer &Ptr = S.Stk.peek(); + + if (!Ptr.isUnknownSizeArray()) + return true; + + const SourceInfo &E = S.Current->getSource(OpPC); + S.FFDiag(E, diag::note_constexpr_unsupported_unsized_array); + + return false; +} + template ::T> inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { const T &Offset = S.Stk.pop(); const Pointer &Ptr = S.Stk.pop(); + if (!CheckArray(S, OpPC, Ptr)) + return false; + if (!OffsetHelper(S, OpPC, Offset, Ptr)) return false; diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index e1e7e5e2efbb0..69068e87d5720 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -687,3 +687,5 @@ def InvalidCast : Opcode { def InvalidDeclRef : Opcode { let Args = [ArgDeclRef]; } + +def ArrayDecay : Opcode; diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp index 7110785ea4c66..d1673094c2660 100644 --- a/clang/test/AST/Interp/arrays.cpp +++ b/clang/test/AST/Interp/arrays.cpp @@ -455,3 +455,45 @@ namespace NoInitMapLeak { // ref-error {{not an integral constant expression}} \ // ref-note {{in call to}} } + +namespace Incomplete { + struct Foo { + char c; + int a[]; + }; + + constexpr Foo F{}; + constexpr const int *A = F.a; // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{array-to-pointer decay of array member without known bound}} \ + // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{array-to-pointer decay of array member without known bound}} + + constexpr const int *B = F.a + 1; // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{array-to-pointer decay of array member without known bound}} \ + // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{array-to-pointer decay of array member without known bound}} + + constexpr int C = *F.a; // ref-error {{must be initialized by a constant expression}} \ + // ref-note {{array-to-pointer decay of array member without known bound}} \ + // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{array-to-pointer decay of array member without known bound}} + + + + /// These are from test/SemaCXX/constant-expression-cxx11.cpp + /// and are the only tests using the 'indexing of array without known bound' diagnostic. + /// We currently diagnose them differently. 
+  extern int arr[]; // expected-note 3{{declared here}}
+  constexpr int *c = &arr[1]; // ref-error {{must be initialized by a constant expression}} \
+                              // ref-note {{indexing of array without known bound}} \
+                              // expected-error {{must be initialized by a constant expression}} \
+                              // expected-note {{read of non-constexpr variable 'arr'}}
+  constexpr int *d = &arr[1]; // ref-error {{must be initialized by a constant expression}} \
+                              // ref-note {{indexing of array without known bound}} \
+                              // expected-error {{must be initialized by a constant expression}} \
+                              // expected-note {{read of non-constexpr variable 'arr'}}
+  constexpr int *e = arr + 1; // ref-error {{must be initialized by a constant expression}} \
+                              // ref-note {{indexing of array without known bound}} \
+                              // expected-error {{must be initialized by a constant expression}} \
+                              // expected-note {{read of non-constexpr variable 'arr'}}
+}

From 64025b8eba200c0be7cedbb36c6dcbbea3ca96c7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Thu, 26 Oct 2023 12:56:28 +0100
Subject: [PATCH 054/877] [mlir][SVE] Add an e2e test for vectorization of linalg.matmul (#69592)

--delete-branch

---
 .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir     | 71 +++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir

diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
new file mode 100644
index 0000000000000..bc94161d5d375
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir
@@ -0,0 +1,71 @@
+// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule \
+// RUN:   -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \
+// RUN:   -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \
+// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \
+// RUN: FileCheck %s
+
+func.func @entry() {
+  %c1 = arith.constant 1 : index
+  %c2 = arith.constant 2 : index
+  %c4 = arith.constant 4 : index
+  %c0 = arith.constant 0 : index
+  %step = arith.constant 1 : index
+  %c0_f32 = arith.constant 0.0 : f32
+
+  %vscale = vector.vscale
+  %vl_fp = arith.muli %c4, %vscale : index
+  %A_alloc = bufferization.alloc_tensor(%c2, %c1) : tensor
+  %B_alloc = bufferization.alloc_tensor(%c1, %vl_fp) : tensor
+  %C_alloc = bufferization.alloc_tensor(%c2, %vl_fp) : tensor
+
+  %pi = arith.constant 3.14 : f32
+  %A = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor
+  %B = linalg.fill ins(%pi : f32) outs(%B_alloc : tensor) -> tensor
+  %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor
+
+  %C_out = linalg.matmul ins(%A, %B: tensor, tensor) outs(%C_in: tensor) -> tensor
+
+  // CHECK-LABEL: SVE: START OF TEST OUTPUT
+  vector.print str "SVE: START OF TEST OUTPUT"
+
+  // There are at least 4 x f32 elements in every SVE vector, i.e.
+  //   * %vscale >= 1.
+  // Hence, when checking the output there will always be at least 4 elements
+  // in every row. For implementations with wider vectors, you should see more
+  // elements being printed.
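+  // (Accordingly, the row-data CHECK lines below only anchor on the first
+  // four values of each row and leave the rest of the row unconstrained.)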
+  // CHECK-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [2, 16] strides = [16, 1] data =
+  // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
+  // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596
+
+  %xf = tensor.cast %C_out : tensor to tensor<*xf32>
+  call @printMemrefF32(%xf) : (tensor<*xf32>) -> ()
+
+  // CHECK-NEXT: SVE: END OF TEST OUTPUT
+  vector.print str "SVE: END OF TEST OUTPUT"
+
+  return
+}
+
+transform.sequence failures(propagate) {
+^bb1(%module_op: !transform.any_op):
+  %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
+  %func_op = get_parent_op %0 : (!transform.any_op) -> !transform.op<"func.func">
+  // The tile sizes match the output matrix sizes
+  %1, %loops:3 = transform.structured.tile_using_for %0 [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
+  %2 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op
+  // The vector sizes match the output matrix sizes
+  // TODO: Use variables to re-use "shared" sizes
+  transform.structured.vectorize %2 vector_sizes [2, [4], 1] : !transform.any_op
+
+  transform.apply_patterns to %func_op {
+    transform.apply_patterns.vector.reduction_to_contract
+    transform.apply_patterns.vector.transfer_permutation_patterns
+    transform.apply_patterns.vector.lower_masked_transfers
+  } : !transform.op<"func.func">
+  transform.apply_patterns to %func_op {
+    transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct"
+    transform.apply_patterns.vector.lower_outerproduct
+  } : !transform.op<"func.func">
+}
+
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)

From e9c4dc18bc9d34a2c055117a93799be9ade57d41 Mon Sep 17 00:00:00 2001
From: Jay Foad
Date: Thu, 26 Oct 2023 12:51:45 +0100
Subject: [PATCH 055/877] Revert "[AMDGPU] Use `S_CSELECT` for uniform i1 ext (#69703)"

This reverts commit a1260b5209968c08886e3c6183aa793de8931578.

It was causing some Vulkan CTS failures.

---
 llvm/lib/Target/AMDGPU/SIInstructions.td | 25 ++++-----
 llvm/test/CodeGen/AMDGPU/saddo.ll        | 65 ++++++++--------
 llvm/test/CodeGen/AMDGPU/uaddo.ll        | 46 ++++++++---------
 llvm/test/CodeGen/AMDGPU/usubo.ll        | 46 ++++++++---------
 llvm/test/CodeGen/AMDGPU/zero_extend.ll  |  2 +-
 5 files changed, 77 insertions(+), 107 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 707a1c72b5b7c..567f1b812c180 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2278,24 +2278,17 @@ def : GCNPat <
   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
 >;
 
-multiclass ZExt_i64_i1_Pat {
-  def: GCNPat <
-    (i64 (ext i1:$src)),
-    (REG_SEQUENCE VReg_64,
-      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
-                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
-      sub0, (S_MOV_B32 (i32 0)), sub1)
-  >;
-
-  def : GCNPat <
-    (i64 (UniformUnaryFrag SCC)),
-    (S_CSELECT_B64 (i64 1), (i64 0))
-  >;
-}
+class ZExt_i64_i1_Pat : GCNPat <
+  (i64 (ext i1:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                       /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+    sub0, (S_MOV_B32 (i32 0)), sub1)
+>;
 
-defm : ZExt_i64_i1_Pat;
-defm : ZExt_i64_i1_Pat;
+def : ZExt_i64_i1_Pat;
+def : ZExt_i64_i1_Pat;
 
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index f6f3e47c3be7a..cb3166d7a20d3 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -29,12 +29,10 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc -; SI-NEXT: s_and_b64 s[4:5], s[4:5], exec -; SI-NEXT: s_cselect_b64 s[4:5], 1, 0 -; SI-NEXT: s_add_u32 s4, s10, s4 -; SI-NEXT: s_addc_u32 s5, s11, s5 -; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -47,17 +45,15 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: s_add_u32 s2, s6, s0 ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_addc_u32 s3, s7, s1 +; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; VI-NEXT: s_and_b64 s[0:1], s[0:1], exec -; VI-NEXT: s_cselect_b64 s[0:1], 1, 0 -; VI-NEXT: s_add_u32 s0, s2, s0 -; VI-NEXT: s_addc_u32 s1, s3, s1 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -71,15 +67,13 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 -; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc -; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -93,14 +87,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-NEXT: s_add_u32 s0, s6, s2 ; GFX10-NEXT: s_addc_u32 s1, s7, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7] -; GFX10-NEXT: s_xor_b32 s2, s2, s6 -; GFX10-NEXT: s_and_b32 s2, s2, exec_lo -; GFX10-NEXT: s_cselect_b64 s[2:3], 1, 0 -; GFX10-NEXT: s_add_u32 s0, s0, s2 -; GFX10-NEXT: s_addc_u32 s1, s1, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -109,20 
+100,18 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 +; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 ; GFX11-NEXT: s_addc_u32 s3, s7, s1 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, s6 -; GFX11-NEXT: s_and_b32 s0, s0, exec_lo -; GFX11-NEXT: s_cselect_b64 s[0:1], 1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_add_u32 s0, s2, s0 -; GFX11-NEXT: s_addc_u32 s1, s3, s1 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: s_xor_b32 s0, s0, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 0ebf3f5198203..4363db2351e7a 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -7,23 +7,21 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_add_u32 s0, s6, s0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_addc_u32 s1, s7, s1 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_cselect_b64 s[6:7], 1, 0 -; SI-NEXT: s_add_u32 s6, s0, s6 -; SI-NEXT: s_addc_u32 s7, s1, s7 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_add_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_addc_u32 s5, s7, s9 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -32,19 +30,17 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_addc_u32 s1, s7, s1 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[1:2] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 -; VI-NEXT: s_add_u32 s0, s0, s2 -; VI-NEXT: s_addc_u32 s1, s1, s3 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 
+; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -56,15 +52,13 @@ define amdgpu_kernel void @s_uaddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_addc_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index ade0616137b17..37b5be3b672f2 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -8,23 +8,21 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_sub_u32 s0, s6, s0 -; SI-NEXT: v_mov_b32_e32 v0, s6 -; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_subb_u32 s1, s7, s1 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; SI-NEXT: s_and_b64 s[6:7], vcc, exec -; SI-NEXT: s_cselect_b64 s[6:7], 1, 0 -; SI-NEXT: s_add_u32 s6, s0, s6 -; SI-NEXT: s_addc_u32 s7, s1, s7 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_sub_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_subb_u32 s5, s7, s9 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -33,19 +31,17 @@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s6 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_subb_u32 s1, s7, s1 -; VI-NEXT: v_mov_b32_e32 v2, s7 -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[1:2] -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_and_b64 s[2:3], vcc, exec -; VI-NEXT: s_cselect_b64 s[2:3], 1, 0 -; VI-NEXT: s_add_u32 s0, s0, s2 -; VI-NEXT: s_addc_u32 s1, s1, s3 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] ; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -57,15 +53,13 
@@ define amdgpu_kernel void @s_usubo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 % ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_sub_u32 s0, s6, s2 -; GFX9-NEXT: s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: s_subb_u32 s1, s7, s3 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] -; GFX9-NEXT: s_and_b64 s[2:3], vcc, exec -; GFX9-NEXT: s_cselect_b64 s[2:3], 1, 0 -; GFX9-NEXT: s_add_u32 s0, s0, s2 -; GFX9-NEXT: s_addc_u32 s1, s1, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll index 9933cdc18e5fd..1f532f2706de7 100644 --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -38,7 +38,7 @@ define amdgpu_kernel void @s_arg_zext_i1_to_i64(ptr addrspace(1) %out, i1 zeroex ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0 ; GCN-DAG: s_cmp_eq_u32 -; GCN: s_cselect_b64 s[{{[0-9]+:[0-9]+}}], 1, 0 +; GCN: v_cndmask_b32 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64 From bb2b7530adc87dabc782141573ca063b1b76ce9e Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 26 Oct 2023 17:19:58 +0530 Subject: [PATCH 056/877] [AMDGPU] precommit lit test for PR 69924. --- .../AMDGPU/bb-prolog-spill-during-regalloc.ll | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll diff --git a/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll new file mode 100644 index 0000000000000..a844b577c842f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bb-prolog-spill-during-regalloc.ll @@ -0,0 +1,93 @@ +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs --stop-after=regallocfast,1 -o - %s | FileCheck -check-prefix=REGALLOC %s + +; FIXME: There are two spill codes inserted wrongly in this test. +; They are inserted during regalloc for the BBLiveIns - the spill restores for vgpr1 in the Flow block (bb.1) and for vgpr0 in the return block (bb.4). 
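+; This is a precommit test for PR 69924: the REGALLOC lines below capture the
+; current, pre-fix output and are expected to change once that fix lands.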
+define i32 @prolog_spill(i32 %arg0, i32 %arg1, i32 %arg2) { + ; REGALLOC-LABEL: name: prolog_spill + ; REGALLOC: bb.0.bb.0: + ; REGALLOC-NEXT: successors: %bb.3(0x40000000), %bb.1(0x40000000) + ; REGALLOC-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 + ; REGALLOC-NEXT: {{ $}} + ; REGALLOC-NEXT: renamable $vgpr3 = IMPLICIT_DEF + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr2, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed $vgpr0 + ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: renamable $sgpr4 = S_MOV_B32 49 + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = V_CMP_GT_I32_e64 killed $vgpr1, killed $sgpr4, implicit $exec + ; REGALLOC-NEXT: renamable $sgpr6 = IMPLICIT_DEF + ; REGALLOC-NEXT: renamable $vgpr1 = COPY killed renamable $sgpr6 + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = COPY $exec, implicit-def $exec + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 renamable $sgpr6_sgpr7, killed renamable $sgpr4_sgpr5, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $sgpr6_sgpr7 = S_XOR_B64 renamable $sgpr4_sgpr5, killed renamable $sgpr6_sgpr7, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr6, 0, $vgpr0, implicit-def $sgpr6_sgpr7, implicit $sgpr6_sgpr7 + ; REGALLOC-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr7, 1, $vgpr0, implicit killed $sgpr6_sgpr7 + ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; REGALLOC-NEXT: $exec = S_MOV_B64_term killed renamable $sgpr4_sgpr5 + ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.1, implicit $exec + ; REGALLOC-NEXT: S_BRANCH %bb.3 + ; REGALLOC-NEXT: {{ $}} + ; REGALLOC-NEXT: bb.1.Flow: + ; REGALLOC-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) + ; REGALLOC-NEXT: {{ $}} + ; REGALLOC-NEXT: $vgpr0 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; REGALLOC-NEXT: $vgpr1 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; REGALLOC-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5 + ; REGALLOC-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 killed renamable $sgpr4_sgpr5, implicit-def $exec, implicit-def dead $scc, implicit $exec + ; REGALLOC-NEXT: SI_SPILL_V32_SAVE killed $vgpr1, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; REGALLOC-NEXT: renamable $sgpr4_sgpr5 = S_AND_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc + ; REGALLOC-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr4, 2, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr5, 3, $vgpr0, implicit $sgpr4_sgpr5 + ; REGALLOC-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr0, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; REGALLOC-NEXT: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc + ; REGALLOC-NEXT: S_CBRANCH_EXECZ %bb.4, implicit $exec + ; REGALLOC-NEXT: 
S_BRANCH %bb.2
+  ; REGALLOC-NEXT: {{  $}}
+  ; REGALLOC-NEXT:   bb.2.bb.1:
+  ; REGALLOC-NEXT:   successors: %bb.4(0x80000000)
+  ; REGALLOC-NEXT: {{  $}}
+  ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5)
+  ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 10
+  ; REGALLOC-NEXT:   renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec
+  ; REGALLOC-NEXT:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5)
+  ; REGALLOC-NEXT:   S_BRANCH %bb.4
+  ; REGALLOC-NEXT: {{  $}}
+  ; REGALLOC-NEXT:   bb.3.bb.2:
+  ; REGALLOC-NEXT:   successors: %bb.1(0x80000000)
+  ; REGALLOC-NEXT: {{  $}}
+  ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5)
+  ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 20
+  ; REGALLOC-NEXT:   renamable $vgpr0 = V_ADD_U32_e64 $vgpr0, killed $sgpr4, 0, implicit $exec
+  ; REGALLOC-NEXT:   SI_SPILL_V32_SAVE killed $vgpr0, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5)
+  ; REGALLOC-NEXT:   S_BRANCH %bb.1
+  ; REGALLOC-NEXT: {{  $}}
+  ; REGALLOC-NEXT:   bb.4.bb.3:
+  ; REGALLOC-NEXT:   $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5)
+  ; REGALLOC-NEXT:   $vgpr0 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5)
+  ; REGALLOC-NEXT:   $sgpr4 = V_READLANE_B32 $vgpr1, 2, implicit-def $sgpr4_sgpr5
+  ; REGALLOC-NEXT:   $sgpr5 = V_READLANE_B32 $vgpr1, 3
+  ; REGALLOC-NEXT:   $exec = S_OR_B64 $exec, killed renamable $sgpr4_sgpr5, implicit-def dead $scc
+  ; REGALLOC-NEXT:   renamable $sgpr4 = S_MOV_B32 5
+  ; REGALLOC-NEXT:   renamable $vgpr0 = V_MUL_LO_U32_e64 killed $vgpr0, killed $sgpr4, implicit $exec
+  ; REGALLOC-NEXT:   KILL killed renamable $vgpr1
+  ; REGALLOC-NEXT:   SI_RETURN implicit killed $vgpr0
+bb.0:
+  %cmp = icmp slt i32 %arg0, 50
+  br i1 %cmp, label %bb.1, label %bb.2
+
+bb.1:
+  %val1 = add i32 %arg1, 10
+  br label %bb.3
+
+bb.2:
+  %val2 = add i32 %arg2, 20
+  br label %bb.3
+
+bb.3:
+  %val = phi i32 [ %val1, %bb.1 ], [ %val2, %bb.2 ]
+  %ret = mul i32 %val, 5;
+  ret i32 %ret
+}

From ce6b9b3b58b6c9e51d87084c916fa7aef81401f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Markus=20M=C3=BCtzel?=
Date: Thu, 26 Oct 2023 14:30:11 +0200
Subject: [PATCH 057/877] [flang][runtime] Avoid dependency on libc++ for
 `std::__libcpp_verbose_abort`

Changes in libc++ during the development cycle for LLVM 17 led to the
FortranRuntime library depending on libc++.
Trying to build with a Flang 17 that was built with clang++ 17 and libc++ 17
(on MinGW) leads to the following linker error:

ld.lld: error: undefined symbol: std::__1::__libcpp_verbose_abort(char const*, ...)
>>> referenced by libFortranRuntime.a(io-api.cpp.obj):(std::__1::__throw_bad_variant_access[abi:v170000]())
>>> referenced by libFortranRuntime.a(io-stmt.cpp.obj)
>>> referenced by libFortranRuntime.a(unit.cpp.obj)

That might be caused by std::get being called on a std::variant in
common::visit.

std::__libcpp_verbose_abort is a weak symbol in libc++ that can be
optionally replaced by an alternative definition in user code (see [1]).

Do that to avoid a dependency of the FortranRuntime on libc++; a
stand-alone sketch of the override mechanism follows below.
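For illustration only (not part of the patch): a minimal stand-alone program
exercising the same override mechanism. It assumes a libc++ toolchain built
with -fno-exceptions, where std::get on the wrong variant alternative reaches
std::__throw_bad_variant_access and, through it, the verbose-abort hook; the
definition mirrors the one added to io-api.cpp below.

```cpp
#include <cstdarg>
#include <cstdio>
#include <cstdlib>
#include <variant>

// Strong definition that satisfies libc++'s weak reference to
// std::__libcpp_verbose_abort (same shape as the FortranRuntime override).
void std::__libcpp_verbose_abort(char const *format, ...) {
  va_list list;
  va_start(list, format);
  std::vfprintf(stderr, format, list);
  va_end(list);

  std::abort();
}

int main() {
  std::variant<int, float> v = 42;
  // Wrong alternative: with -fno-exceptions, libc++ routes the failure
  // through the replacement handler above instead of throwing.
  return static_cast<int>(std::get<float>(v));
}
```

With a definition like this present, the weak-symbol reference is resolved
inside the program itself, so the abort path pulls in no libc++ runtime bits,
which is exactly the dependency the patch removes from FortranRuntime.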
[1]: https://libcxx.llvm.org/UsingLibcxx.html#overriding-the-default-termination-handler See also: https://github.com/msys2/MINGW-packages/pull/18002#issuecomment-1694412640 Differential Revision: https://reviews.llvm.org/D158957 --- flang/runtime/io-api.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/flang/runtime/io-api.cpp b/flang/runtime/io-api.cpp index f9d60fecb149a..2fc530c7431a5 100644 --- a/flang/runtime/io-api.cpp +++ b/flang/runtime/io-api.cpp @@ -1517,3 +1517,17 @@ enum Iostat IONAME(CheckUnitNumberInRange128)(common::int128_t unit, #endif } // namespace Fortran::runtime::io + +#if defined(_LIBCPP_VERBOSE_ABORT) +// Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency +// on the version provided by libc++. + +void std::__libcpp_verbose_abort(char const *format, ...) { + va_list list; + va_start(list, format); + std::vfprintf(stderr, format, list); + va_end(list); + + std::abort(); +} +#endif From 4638c29c3dd14048ca78e37f132d4c75f490d139 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 26 Oct 2023 13:09:21 +0200 Subject: [PATCH 058/877] [InstSimplify] Remove redundant pointer icmp fold (NFCI) This fold is already performed as part of simplifyICmpWithZero(). --- llvm/lib/Analysis/InstructionSimplify.cpp | 7 ------- 1 file changed, 7 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index b3feb2470e58e..3d192d0759a1e 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2736,13 +2736,6 @@ static Constant *computePointerICmp(CmpInst::Predicate Pred, Value *LHS, const TargetLibraryInfo *TLI = Q.TLI; const DominatorTree *DT = Q.DT; const Instruction *CxtI = Q.CxtI; - const InstrInfoQuery &IIQ = Q.IIQ; - - // A non-null pointer is not equal to a null pointer. - if (isa(RHS) && ICmpInst::isEquality(Pred) && - llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr, - IIQ.UseInstrInfo)) - return ConstantInt::get(getCompareTy(LHS), !CmpInst::isTrueWhenEqual(Pred)); // We can only fold certain predicates on pointer comparisons. switch (Pred) { From 851338b126b8307300ae1c41bd9fb46791365bec Mon Sep 17 00:00:00 2001 From: Allen Date: Thu, 26 Oct 2023 20:39:24 +0800 Subject: [PATCH 059/877] Revert "[SimplifyCFG] Delete the unnecessary range check for small mask operation (#70324) This reverts commit 5e07481d4240b5e8fd85f9b92df30849606c2af0. --- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 22 ++--------------- .../Transforms/SimplifyCFG/switch_mask.ll | 24 +++++++++++-------- 2 files changed, 16 insertions(+), 30 deletions(-) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 18187bcdedf09..68b5b1a78a346 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -6598,8 +6598,9 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, // If the default destination is unreachable, or if the lookup table covers // all values of the conditional variable, branch directly to the lookup table // BB. Otherwise, check that the condition is within the case range. - bool DefaultIsReachable = + const bool DefaultIsReachable = !isa(SI->getDefaultDest()->getFirstNonPHIOrDbg()); + const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); // Create the BB that does the lookups. 
Module &Mod = *CommonDest->getParent()->getParent(); @@ -6630,25 +6631,6 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder, BranchInst *RangeCheckBranch = nullptr; - // Grow the table to cover all possible index values to avoid the range check. - if (UseSwitchConditionAsTableIndex) { - ConstantRange CR = computeConstantRange(TableIndex, /* ForSigned */ false); - // Grow the table shouldn't have any size impact by checking - // WouldFitInRegister. - // TODO: Consider growing the table also when it doesn't fit in a register - // if no optsize is specified. - if (all_of(ResultTypes, [&](const auto &KV) { - return SwitchLookupTable::WouldFitInRegister( - DL, CR.getUpper().getLimitedValue(), KV.second /* ResultType */); - })) { - // The default branch is unreachable when we enlarge the lookup table. - // Adjust DefaultIsReachable to reuse code path. - TableSize = CR.getUpper().getZExtValue(); - DefaultIsReachable = false; - } - } - - const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize); if (!DefaultIsReachable || GeneratingCoveredLookupTable) { Builder.CreateBr(LookupBB); if (DTU) diff --git a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll index 123519bc69211..8c97a0660d070 100644 --- a/llvm/test/Transforms/SimplifyCFG/switch_mask.ll +++ b/llvm/test/Transforms/SimplifyCFG/switch_mask.ll @@ -8,11 +8,13 @@ define i1 @switch_lookup_with_small_i1(i64 %x) { ; CHECK-LABEL: @switch_lookup_with_small_i1( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[AND:%.*]] = and i64 [[X:%.*]], 15 -; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[AND]] to i16 -; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i16 [[SWITCH_CAST]], 1 -; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i16 1030, [[SWITCH_SHIFTAMT]] -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i16 [[SWITCH_DOWNSHIFT]] to i1 -; CHECK-NEXT: ret i1 [[SWITCH_MASKED]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[AND]], 11 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[AND]] to i11 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i11 [[SWITCH_CAST]], 1 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i11 -1018, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i11 [[SWITCH_DOWNSHIFT]] to i1 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i1 [[SWITCH_MASKED]], i1 false +; CHECK-NEXT: ret i1 [[TMP1]] ; entry: %and = and i64 %x, 15 @@ -35,11 +37,13 @@ define i8 @switch_lookup_with_small_i8(i64 %x) { ; CHECK-LABEL: @switch_lookup_with_small_i8( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[REM:%.*]] = urem i64 [[X:%.*]], 5 -; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i40 -; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i40 [[SWITCH_CAST]], 8 -; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i40 460303, [[SWITCH_SHIFTAMT]] -; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i40 [[SWITCH_DOWNSHIFT]] to i8 -; CHECK-NEXT: ret i8 [[SWITCH_MASKED]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i64 [[REM]], 3 +; CHECK-NEXT: [[SWITCH_CAST:%.*]] = trunc i64 [[REM]] to i24 +; CHECK-NEXT: [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i24 [[SWITCH_CAST]], 8 +; CHECK-NEXT: [[SWITCH_DOWNSHIFT:%.*]] = lshr i24 460303, [[SWITCH_SHIFTAMT]] +; CHECK-NEXT: [[SWITCH_MASKED:%.*]] = trunc i24 [[SWITCH_DOWNSHIFT]] to i8 +; CHECK-NEXT: [[TMP1:%.*]] = select i1 [[TMP0]], i8 [[SWITCH_MASKED]], i8 0 +; CHECK-NEXT: ret i8 [[TMP1]] ; entry: %rem = urem i64 %x, 5 From fabcadf2eb20dcc2fa262a5a1c6752ac93544e9d Mon Sep 17 00:00:00 2001 From: Erik Jonsson Date: Thu, 26 Oct 2023 14:43:38 +0200 Subject: [PATCH 060/877] 
Let M68kMCCodeEmitter set Scratch size. (#69898)

The Scratch buffer passed to getBinaryCodeForInst needs to be able to
hold any value returned by getMachineOpValue or other custom encoders.
It's better to let the caller of getBinaryCodeForInst set the size of
Scratch as it's impossible for VarLenCodeEmitterGen to know what the
smallest needed size is.

VarLenCodeEmitterGen now calculates its smallest needed Scratch bit
width based on the slice operations and zero-extends Scratch if it's too
small. This only guarantees that Scratch has enough bits for the
generated code, not for getMachineOpValue or custom encoders.

The smallest internal APInt representation uses one uint64_t word so
there is no point in using a smaller size.
---
 .../M68k/MCTargetDesc/M68kMCCodeEmitter.cpp   |  2 +-
 llvm/test/TableGen/VarLenEncoder.td           |  4 ++--
 llvm/utils/TableGen/VarLenCodeEmitterGen.cpp  | 20 ++++++++++++++-----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
index 97f5d7a3dc077..a9ff059bc990b 100644
--- a/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
+++ b/llvm/lib/Target/M68k/MCTargetDesc/M68kMCCodeEmitter.cpp
@@ -209,7 +209,7 @@ void M68kMCCodeEmitter::encodeInstruction(const MCInst &MI,
   // Try using the new method first.
   APInt EncodedInst(16, 0U);
-  APInt Scratch(16, 0U);
+  APInt Scratch(64, 0U); // One APInt word is enough.
   getBinaryCodeForInstr(MI, Fixups, EncodedInst, Scratch, STI);

   ArrayRef Data(EncodedInst.getRawData(), EncodedInst.getNumWords());
diff --git a/llvm/test/TableGen/VarLenEncoder.td b/llvm/test/TableGen/VarLenEncoder.td
index 3dd100a50fc58..0fabf4150b79d 100644
--- a/llvm/test/TableGen/VarLenEncoder.td
+++ b/llvm/test/TableGen/VarLenEncoder.td
@@ -65,7 +65,7 @@ def FOO32 : MyVarInst>;
 // CHECK: UINT64_C(46848), // FOO32

 // CHECK-LABEL: case ::FOO16: {
-// CHECK: Scratch = Scratch.zext(41);
+// CHECK: Scratch.getBitWidth() < 16
 // src.reg
 // CHECK: getMachineOpValue(MI, MI.getOperand(1), /*Pos=*/0, Scratch, Fixups, STI);
 // CHECK: Inst.insertBits(Scratch.extractBits(8, 0), 0);
@@ -83,7 +83,7 @@ def FOO32 : MyVarInst>;
 // CHECK: Inst.insertBits(Scratch.extractBits(2, 0), 39);

 // CHECK-LABEL: case ::FOO32: {
-// CHECK: Scratch = Scratch.zext(57);
+// CHECK: Scratch.getBitWidth() < 32
 // src.reg
 // CHECK: getMachineOpValue(MI, MI.getOperand(1), /*Pos=*/0, Scratch, Fixups, STI);
 // CHECK: Inst.insertBits(Scratch.extractBits(8, 0), 0);
diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp
index 24f116bbeaced..bfb7e5c333170 100644
--- a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp
+++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp
@@ -60,6 +60,8 @@
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"

+#include
+
 using namespace llvm;

 namespace {
@@ -445,20 +447,17 @@ std::string VarLenCodeEmitterGen::getInstructionCases(Record *R,
 std::string VarLenCodeEmitterGen::getInstructionCaseForEncoding(
     Record *R, AltEncodingTy Mode, const VarLenInst &VLI, CodeGenTarget &Target,
     int I) {
-  size_t BitWidth = VLI.size();

   CodeGenInstruction &CGI = Target.getInstruction(R);

   std::string Case;
   raw_string_ostream SS(Case);
-  // Resize the scratch buffer.
-  if (BitWidth && !VLI.isFixedValueOnly())
-    SS.indent(I) << "Scratch = Scratch.zext(" << BitWidth << ");\n";
   // Populate based value.
   SS.indent(I) << "Inst = getInstBits" << Modes[Mode] << "(opcode);\n";

   // Process each segment in VLI.
   size_t Offset = 0U;
+  unsigned HighScratchAccess = 0U;
   for (const auto &ES : VLI) {
     unsigned NumBits = ES.BitWidth;
     const Init *Val = ES.Value;
@@ -497,6 +496,8 @@ std::string VarLenCodeEmitterGen::getInstructionCaseForEncoding(
              << "Scratch.extractBits(" << utostr(NumBits) << ", "
              << utostr(LoBit) << ")"
              << ", " << Offset << ");\n";
+
+      HighScratchAccess = std::max(HighScratchAccess, NumBits + LoBit);
     }
     Offset += NumBits;
   }
@@ -505,7 +506,16 @@ std::string VarLenCodeEmitterGen::getInstructionCaseForEncoding(
   if (!PostEmitter.empty())
     SS.indent(I) << "Inst = " << PostEmitter << "(MI, Inst, STI);\n";

-  return Case;
+  // Resize the scratch buffer if it's too small.
+  std::string ScratchResizeStr;
+  if (VLI.size() && !VLI.isFixedValueOnly()) {
+    raw_string_ostream RS(ScratchResizeStr);
+    RS.indent(I) << "if (Scratch.getBitWidth() < " << HighScratchAccess
+                 << ") { Scratch = Scratch.zext(" << HighScratchAccess
+                 << "); }\n";
+  }
+
+  return ScratchResizeStr + Case;
 }

 namespace llvm {

From 2e85123bfe8501e08689a27c6cf93203df06654a Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Thu, 26 Oct 2023 13:46:32 +0100
Subject: [PATCH 061/877] [VP] Check if VP ops with functional intrinsics are
 speculatable (#69504)

Noticed whilst working on #69494. VP intrinsics whose functional
equivalent is an intrinsic were having their lanes marked as
non-speculatable, even if the underlying intrinsic was speculatable.
This meant that

```llvm
%1 = call <4 x i32> @llvm.vp.umax(<4 x i32> %x, <4 x i32> %y, <4 x i1> %mask, i32 %evl)
```

would be expanded out to

```llvm
%.splatinsert = insertelement <4 x i32> poison, i32 %evl, i64 0
%.splat = shufflevector <4 x i32> %.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
%1 = icmp ult <4 x i32> , %.splat
%2 = and <4 x i1> %1, %mask
%3 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x, <4 x i32> %y)
```

instead of

```llvm
%1 = call <4 x i32> @llvm.umax.v4i32(<4 x i32> %x, <4 x i32> %y)
```

The cause of this was isSafeToSpeculativelyExecuteWithOpcode checking
the function attributes for the VP instruction itself, not the
functional intrinsic. Since isSafeToSpeculativelyExecuteWithOpcode
expects an already materialized instruction, we can't use it directly
for the intrinsic case. So this fixes it by manually checking the
function attributes on the intrinsic.
---
 llvm/lib/CodeGen/ExpandVectorPredication.cpp |  9 +-
 llvm/test/CodeGen/Generic/expand-vp.ll      | 92 +++++++++++++-------
 2 files changed, 68 insertions(+), 33 deletions(-)

diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
index 2d4da33a566dc..6c873a9aee27f 100644
--- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp
+++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp
@@ -123,9 +123,12 @@ static bool maySpeculateLanes(VPIntrinsic &VPI) {
   if (isa(VPI))
     return false;
   // Fallback to whether the intrinsic is speculatable.
- std::optional OpcOpt = VPI.getFunctionalOpcode(); - unsigned FunctionalOpc = OpcOpt.value_or((unsigned)Instruction::Call); - return isSafeToSpeculativelyExecuteWithOpcode(FunctionalOpc, &VPI); + if (auto IntrID = VPI.getFunctionalIntrinsicID()) + return Intrinsic::getAttributes(VPI.getContext(), *IntrID) + .hasFnAttr(Attribute::AttrKind::Speculatable); + if (auto Opc = VPI.getFunctionalOpcode()) + return isSafeToSpeculativelyExecuteWithOpcode(*Opc, &VPI); + return false; } //// } Helpers diff --git a/llvm/test/CodeGen/Generic/expand-vp.ll b/llvm/test/CodeGen/Generic/expand-vp.ll index 509f86a64d9ce..40d183273b86d 100644 --- a/llvm/test/CodeGen/Generic/expand-vp.ll +++ b/llvm/test/CodeGen/Generic/expand-vp.ll @@ -18,6 +18,10 @@ declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.smax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.smin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.umax.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) +declare <8 x i32> @llvm.vp.umin.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ; Bit arith declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) @@ -52,12 +56,16 @@ define void @test_vp_int_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x i32> %i2, <8 x i3 %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) - %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) + %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) ret void } @@ -70,6 +78,10 @@ declare @llvm.vp.sdiv.nxv4i32(, @llvm.vp.srem.nxv4i32(, , , i32) declare @llvm.vp.udiv.nxv4i32(, , , i32) declare @llvm.vp.urem.nxv4i32(, , , i32) 
+declare @llvm.vp.smax.nxv4i32(, , , i32) +declare @llvm.vp.smin.nxv4i32(, , , i32) +declare @llvm.vp.umax.nxv4i32(, , , i32) +declare @llvm.vp.umin.nxv4i32(, , , i32) ; Bit arith declare @llvm.vp.and.nxv4i32(, , , i32) declare @llvm.vp.xor.nxv4i32(, , , i32) @@ -87,12 +99,16 @@ define void @test_vp_int_vscale( %i0, %i1, %r4 = call @llvm.vp.srem.nxv4i32( %i0, %i1, %m, i32 %n) %r5 = call @llvm.vp.udiv.nxv4i32( %i0, %i1, %m, i32 %n) %r6 = call @llvm.vp.urem.nxv4i32( %i0, %i1, %m, i32 %n) - %r7 = call @llvm.vp.and.nxv4i32( %i0, %i1, %m, i32 %n) - %r8 = call @llvm.vp.or.nxv4i32( %i0, %i1, %m, i32 %n) - %r9 = call @llvm.vp.xor.nxv4i32( %i0, %i1, %m, i32 %n) - %rA = call @llvm.vp.ashr.nxv4i32( %i0, %i1, %m, i32 %n) - %rB = call @llvm.vp.lshr.nxv4i32( %i0, %i1, %m, i32 %n) - %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) + %r7 = call @llvm.vp.smax.nxv4i32( %i0, %i1, %m, i32 %n) + %r8 = call @llvm.vp.smin.nxv4i32( %i0, %i1, %m, i32 %n) + %r9 = call @llvm.vp.umax.nxv4i32( %i0, %i1, %m, i32 %n) + %rA = call @llvm.vp.umin.nxv4i32( %i0, %i1, %m, i32 %n) + %rB = call @llvm.vp.and.nxv4i32( %i0, %i1, %m, i32 %n) + %rC = call @llvm.vp.or.nxv4i32( %i0, %i1, %m, i32 %n) + %rD = call @llvm.vp.xor.nxv4i32( %i0, %i1, %m, i32 %n) + %rE = call @llvm.vp.ashr.nxv4i32( %i0, %i1, %m, i32 %n) + %rF = call @llvm.vp.lshr.nxv4i32( %i0, %i1, %m, i32 %n) + %r10 = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ret void } @@ -166,6 +182,10 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x ; ALL-CONVERT: %{{.+}} = udiv <8 x i32> %i0, %{{.+}} ; ALL-CONVERT-NOT: %{{.+}} = urem <8 x i32> %i0, %i1 ; ALL-CONVERT: %{{.+}} = urem <8 x i32> %i0, %{{.+}} +; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1) +; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1) +; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1) +; ALL-CONVERT-NEXT: %{{.+}} = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1) ; ALL-CONVERT-NEXT: %{{.+}} = and <8 x i32> %i0, %i1 ; ALL-CONVERT-NEXT: %{{.+}} = or <8 x i32> %i0, %i1 ; ALL-CONVERT-NEXT: %{{.+}} = xor <8 x i32> %i0, %i1 @@ -263,12 +283,16 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x ; LEGAL_LEGAL-NEXT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) ; LEGAL_LEGAL-NEXT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) ; LEGAL_LEGAL-NEXT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; 
LEGAL_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) ; LEGAL_LEGAL-NEXT: ret void ; LEGAL_LEGAL:define void @test_vp_int_vscale( %i0, %i1, %i2, %f3, %m, i32 %n) { @@ -279,12 +303,16 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x ; LEGAL_LEGAL-NEXT: %r4 = call @llvm.vp.srem.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: %r5 = call @llvm.vp.udiv.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: %r6 = call @llvm.vp.urem.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r7 = call @llvm.vp.and.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r8 = call @llvm.vp.or.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %r9 = call @llvm.vp.xor.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rA = call @llvm.vp.ashr.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rB = call @llvm.vp.lshr.nxv4i32( %i0, %i1, %m, i32 %n) -; LEGAL_LEGAL-NEXT: %rC = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r7 = call @llvm.vp.smax.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r8 = call @llvm.vp.smin.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r9 = call @llvm.vp.umax.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rA = call @llvm.vp.umin.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rB = call @llvm.vp.and.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rC = call @llvm.vp.or.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rD = call @llvm.vp.xor.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rE = call @llvm.vp.ashr.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %rF = call @llvm.vp.lshr.nxv4i32( %i0, %i1, %m, i32 %n) +; LEGAL_LEGAL-NEXT: %r10 = call @llvm.vp.shl.nxv4i32( %i0, %i1, %m, i32 %n) ; LEGAL_LEGAL-NEXT: ret void ; LEGAL_LEGAL: define void @test_vp_reduce_int_v4(i32 %start, <4 x i32> %vi, <4 x i1> %m, i32 %n) { @@ -342,12 +370,16 @@ define void @test_vp_cmp_v8(<8 x i32> %i0, <8 x i32> %i1, <8 x float> %f0, <8 x ; DISCARD_LEGAL-NOT: %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) ; DISCARD_LEGAL-NOT: %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) ; DISCARD_LEGAL-NOT: %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL: %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, 
<8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) -; DISCARD_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL: %r7 = call <8 x i32> @llvm.vp.smax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL: %r8 = call <8 x i32> @llvm.vp.smin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL: %r9 = call <8 x i32> @llvm.vp.umax.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL: %rA = call <8 x i32> @llvm.vp.umin.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %rB = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %rC = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %rD = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %rE = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %rF = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) +; DISCARD_LEGAL-NEXT: %r10 = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 8) ; DISCARD_LEGAL-NEXT: ret void ; TODO compute vscale only once and use caching. From 38c9fab8f34a17462d03eb39fd2d6363f0c81f3e Mon Sep 17 00:00:00 2001 From: Pete Steinfeld <47540744+psteinfeld@users.noreply.github.com> Date: Thu, 26 Oct 2023 05:50:38 -0700 Subject: [PATCH 062/877] [flang] Regularize TODO messages for coarray intrinsics (#70281) Apply a tag to coarray intrinsics to make them easier to recognize along with other coarray constructs. See pull request #69227 for a similar change. --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index 0a023bc6b21ea..fe40fd821f010 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -1343,6 +1343,12 @@ static bool isIntrinsicModuleProcedure(llvm::StringRef name) { name.startswith("ieee_") || name.startswith("__ppc_"); } +static bool isCoarrayIntrinsic(llvm::StringRef name) { + return name.startswith("atomic_") || name.startswith("co_") || + name.contains("image") || name.endswith("cobound") || + name.equals("team_number"); +} + /// Return the generic name of an intrinsic module procedure specific name. /// Remove any "__builtin_" prefix, and any specific suffix of the form /// {_[ail]?[0-9]+}*, such as _1 or _a4. @@ -1363,6 +1369,8 @@ llvm::StringRef genericName(llvm::StringRef specificName) { void crashOnMissingIntrinsic(mlir::Location loc, llvm::StringRef name) { if (isIntrinsicModuleProcedure(name)) TODO(loc, "intrinsic module procedure: " + llvm::Twine(name)); + else if (isCoarrayIntrinsic(name)) + TODO(loc, "coarray: intrinsic " + llvm::Twine(name)); else TODO(loc, "intrinsic: " + llvm::Twine(name)); } From e27ff897c2bf88d9c0b3d101bbe5e830e2831203 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Thu, 26 Oct 2023 12:49:56 +0000 Subject: [PATCH 063/877] Revert "[mlir][SVE] Add an e2e test for vectorization of linalg.matmul (#69592)" Broken bot: * https://lab.llvm.org/buildbot/#/builders/197/builds/10572 This reverts commit 64025b8eba200c0be7cedbb36c6dcbbea3ca96c7. 
--- .../Dialect/Linalg/CPU/ArmSVE/matmul.mlir | 71 ------------------- 1 file changed, 71 deletions(-) delete mode 100644 mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir deleted file mode 100644 index bc94161d5d375..0000000000000 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSVE/matmul.mlir +++ /dev/null @@ -1,71 +0,0 @@ -// RUN: mlir-opt %s -test-transform-dialect-interpreter -test-transform-dialect-erase-schedule \ -// RUN: -one-shot-bufferize -func-bufferize -cse -canonicalize -convert-vector-to-scf -arm-sve-legalize-vector-storage \ -// RUN: -convert-vector-to-llvm="enable-arm-sve" -test-lower-to-llvm | \ -// RUN: %mcr_aarch64_cmd -e=entry -entry-point-result=void --march=aarch64 --mattr="+sve" -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | \ -// RUN: FileCheck %s - -func.func @entry() { - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %c0 = arith.constant 0 : index - %step = arith.constant 1 : index - %c0_f32 = arith.constant 0.0 : f32 - - %vscale = vector.vscale - %vl_fp = arith.muli %c4, %vscale : index - %A_alloc = bufferization.alloc_tensor(%c2, %c1) : tensor - %B_alloc = bufferization.alloc_tensor(%c1, %vl_fp) : tensor - %C_alloc = bufferization.alloc_tensor(%c2, %vl_fp) : tensor - - %pi = arith.constant 3.14 : f32 - %A = linalg.fill ins(%pi : f32) outs(%A_alloc : tensor) -> tensor - %B = linalg.fill ins(%pi : f32) outs(%B_alloc : tensor) -> tensor - %C_in = linalg.fill ins(%c0_f32 : f32) outs(%C_alloc : tensor) -> tensor - - %C_out = linalg.matmul ins(%A, %B: tensor, tensor) outs(%C_in: tensor) -> tensor - - // CHECK-LABEL: SVE: START OF TEST OUTPUT - vector.print str "SVE: START OF TEST OUTPUT" - - // There are at least 4 x f32 elements in every SVE vector, i.e. - // * %vscale >= 1. - // Hence, when checking the outupt there will always be at least 4 elements - // in every row. For implementations with wider vectors, you should see more - // elements being printed. 
- // CHECK-NEXT: Unranked Memref {{.*}} rank = 2 offset = 0 sizes = [2, 16] strides = [16, 1] data = - // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596 - // CHECK-NEXT: [9.8596, 9.8596, 9.8596, 9.8596 - - %xf = tensor.cast %C_out : tensor to tensor<*xf32> - call @printMemrefF32(%xf) : (tensor<*xf32>) -> () - - // CHECK-NEXT: SVE: END OF TEST OUTPUT - vector.print str "SVE: END OF TEST OUTPUT" - - return -} - -transform.sequence failures(propagate) { -^bb1(%module_op: !transform.any_op): - %0 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op - %func_op = get_parent_op %0 : (!transform.any_op) -> !transform.op<"func.func"> - // The tile sizes match the output matrix sizes - %1, %loops:3 = transform.structured.tile_using_for %0 [2, [4], 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op) - %2 = transform.structured.match ops{["linalg.matmul"]} in %module_op : (!transform.any_op) -> !transform.any_op - // The vector sizes match the output matrix sizes - // TOOD: Use variables to re-use "shared" sizes - transform.structured.vectorize %2 vector_sizes [2, [4], 1] : !transform.any_op - - transform.apply_patterns to %func_op { - transform.apply_patterns.vector.reduction_to_contract - transform.apply_patterns.vector.transfer_permutation_patterns - transform.apply_patterns.vector.lower_masked_transfers - } : !transform.op<"func.func"> - transform.apply_patterns to %func_op { - transform.apply_patterns.vector.lower_contraction lowering_strategy = "outerproduct" - transform.apply_patterns.vector.lower_outerproduct - } : !transform.op<"func.func"> -} - -func.func private @printMemrefF32(%ptr : tensor<*xf32>) From e01efddbf3f977525707d25f500300f62b98fe28 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 26 Oct 2023 14:51:30 +0200 Subject: [PATCH 064/877] [clang][Interp] Correctly emit destructors for multi-dimensional arrays (#69140) We were not taking those into account correctly when emitting destructors. Fix that and add tests for it. Fixes #69115 --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 33 +++++++++----- clang/test/AST/Interp/arrays.cpp | 57 ++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 12 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 1e508f8998abe..a5141d728d832 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -2732,19 +2732,28 @@ bool ByteCodeExprGen::emitRecordDestruction(const Descriptor *Desc) { // Arrays. if (Desc->isArray()) { const Descriptor *ElemDesc = Desc->ElemDesc; - const Record *ElemRecord = ElemDesc->ElemRecord; - assert(ElemRecord); // This is not a primitive array. + assert(ElemDesc); + + // Don't need to do anything for these. + if (ElemDesc->isPrimitiveArray()) + return this->emitPopPtr(SourceInfo{}); + + // If this is an array of record types, check if we need + // to call the element destructors at all. If not, try + // to save the work. 
+    if (const Record *ElemRecord = ElemDesc->ElemRecord) {
+      if (const CXXDestructorDecl *Dtor = ElemRecord->getDestructor();
+          !Dtor || Dtor->isTrivial())
+        return this->emitPopPtr(SourceInfo{});
+    }

-    if (const CXXDestructorDecl *Dtor = ElemRecord->getDestructor();
-        Dtor && !Dtor->isTrivial()) {
-      for (ssize_t I = Desc->getNumElems() - 1; I >= 0; --I) {
-        if (!this->emitConstUint64(I, SourceInfo{}))
-          return false;
-        if (!this->emitArrayElemPtrUint64(SourceInfo{}))
-          return false;
-        if (!this->emitRecordDestruction(Desc->ElemDesc))
-          return false;
-      }
+    for (ssize_t I = Desc->getNumElems() - 1; I >= 0; --I) {
+      if (!this->emitConstUint64(I, SourceInfo{}))
+        return false;
+      if (!this->emitArrayElemPtrUint64(SourceInfo{}))
+        return false;
+      if (!this->emitRecordDestruction(ElemDesc))
+        return false;
     }
     return this->emitPopPtr(SourceInfo{});
   }
diff --git a/clang/test/AST/Interp/arrays.cpp b/clang/test/AST/Interp/arrays.cpp
index d1673094c2660..34e0086fb9ee8 100644
--- a/clang/test/AST/Interp/arrays.cpp
+++ b/clang/test/AST/Interp/arrays.cpp
@@ -1,5 +1,7 @@
 // RUN: %clang_cc1 -fexperimental-new-constant-interpreter -verify %s
+// RUN: %clang_cc1 -fexperimental-new-constant-interpreter -std=c++20 -verify %s
 // RUN: %clang_cc1 -verify=ref %s
+// RUN: %clang_cc1 -verify=ref -std=c++20 %s

 constexpr int m = 3;
 constexpr const int *foo[][5] = {
@@ -497,3 +499,58 @@ namespace Incomplete {
                       // expected-error {{must be initialized by a constant expression}} \
                       // expected-note {{read of non-constexpr variable 'arr'}}
 }
+
+namespace GH69115 {
+  /// This used to crash because we were trying to emit destructors for the
+  /// array.
+  constexpr int foo() {
+    int arr[2][2] = {1, 2, 3, 4};
+    return 0;
+  }
+  static_assert(foo() == 0, "");
+
+  /// Test that we still emit the destructors for multi-dimensional
+  /// composite arrays.
+#if __cplusplus >= 202002L
+  constexpr void assert(bool C) {
+    if (C)
+      return;
+    // Invalid in constexpr.
+    (void)(1 / 0); // expected-warning {{undefined}} \
+                   // ref-warning {{undefined}}
+  }
+
+  class F {
+  public:
+    int a;
+    int *dtor;
+    int &idx;
+    constexpr F(int a, int *dtor, int &idx) : a(a), dtor(dtor), idx(idx) {}
+    constexpr ~F() noexcept(false){
+      dtor[idx] = a;
+      ++idx;
+    }
+  };
+  constexpr int foo2() {
+    int dtorIndices[] = {0, 0, 0, 0};
+    int idx = 0;
+
+    {
+      F arr[2][2] = {F(1, dtorIndices, idx),
+                     F(2, dtorIndices, idx),
+                     F(3, dtorIndices, idx),
+                     F(4, dtorIndices, idx)};
+    }
+
+    /// Reverse-reverse order.
+    assert(idx == 4);
+    assert(dtorIndices[0] == 4);
+    assert(dtorIndices[1] == 3);
+    assert(dtorIndices[2] == 2);
+    assert(dtorIndices[3] == 1);
+
+    return 0;
+  }
+  static_assert(foo2() == 0, "");
+#endif
+}

From 560bad013ebcb8d2c2c1722e35270b9a70ab40ce Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Wed, 26 Jul 2023 07:47:08 -0700
Subject: [PATCH 065/877] [SLP] Improve isGatherShuffledEntry by trying
 per-register shuffles.

Currently, when building a gather/buildvector node, we try to build node
shuffles without taking separate vector registers into account. We can
improve the final codegen and the whole vectorization process by including
this information in the analysis and in the vector code emission, which
allows emitting better vectorized code. A simplified sketch of the
per-register splitting follows below.
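For illustration only (not part of the patch): a small, hypothetical sketch
of the per-register splitting described above. A gather mask covering several
vector registers is cut into SliceSize = Mask.size() / NumParts chunks, and
each chunk is analyzed on its own; analyzeSlice is a stand-in for
isGatherShuffledSingleRegisterEntry, not the real LLVM API.

```cpp
#include <cstdio>
#include <vector>

constexpr int PoisonMaskElem = -1;

// Hypothetical stand-in for isGatherShuffledSingleRegisterEntry: decides
// whether one register-sized slice of the mask can reuse an existing shuffle.
static bool analyzeSlice(const std::vector<int> &Slice) {
  for (int Idx : Slice)
    if (Idx != PoisonMaskElem)
      return true; // A real implementation would match against tree entries.
  return false;
}

int main() {
  // A gather mask for 8 scalars that the target spreads over 2 registers.
  std::vector<int> Mask = {0, 1, PoisonMaskElem, 3, 4, 5, PoisonMaskElem, 7};
  unsigned NumParts = 2;
  unsigned SliceSize = Mask.size() / NumParts;

  // Analyze each register-sized part of the mask independently, mirroring
  // the SliceSize = Mask.size() / NumParts loop in the patched code.
  for (unsigned Part = 0; Part < NumParts; ++Part) {
    std::vector<int> Slice(Mask.begin() + Part * SliceSize,
                           Mask.begin() + (Part + 1) * SliceSize);
    std::printf("part %u can be shuffled: %s\n", Part,
                analyzeSlice(Slice) ? "yes" : "no");
  }
  return 0;
}
```

Keeping the analysis per register lets one slice reuse an existing tree entry
even when another slice cannot, instead of giving up on the whole mask.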
Differential Revision: https://reviews.llvm.org/D149742 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 462 +++++++++++++----- .../X86/multi-nodes-to-shuffle.ll | 45 +- 2 files changed, 387 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4f82d2d1d6d91..9b5da445daaab 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2507,17 +2507,31 @@ class BoUpSLP { /// instruction in the list). Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. + /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, + SmallVectorImpl &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -6990,6 +7004,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { BoUpSLP &R; SmallPtrSetImpl &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). + /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { if (Ty->getScalarType()->isPointerTy()) { @@ -7230,6 +7249,49 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef CommonMask, + ArrayRef Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. 
+ void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. + if ((InVectors.size() == 2 && + InVectors.front().get() == &E1 && + InVectors.back().get() == E2) || + (!E2 && InVectors.front().get() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; @@ -7493,31 +7555,74 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. Cost += computeExtractCost(VL, Mask, ShuffleKind); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { - if (E1 == E2) { + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { + if (&E1 == &E2) { assert(all_of(Mask, - [=](int Idx) { - return Idx < static_cast(E1->getVectorFactor()); + [&](int Idx) { + return Idx < static_cast(E1.getVectorFactor()); }) && "Expected single vector shuffle mask."); add(E1, Mask); return; } - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); } - void add(const TreeEntry *E1, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, ArrayRef Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive 
number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); } /// Adds another one input vector and the mask for the shuffling. void add(Value *V1, ArrayRef Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + if (InVectors.empty()) { + assert(CommonMask.empty() && "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + assert(InVectors.size() == 1 && InVectors.front().is() && + !CommonMask.empty() && "Expected only single entry from extracts."); + InVectors.push_back(V1); + unsigned VF = CommonMask.size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); @@ -7579,12 +7684,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ArrayRef VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (auto *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); + if (E->State != TreeEntry::NeedToGather) { + if (auto *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + } + if (!FixedVectorType::isValidElementType(ScalarTy)) + return InstructionCost::getInvalid(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -7596,7 +7705,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecTy = FixedVectorType::get(ScalarTy, VL.size()); } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7629,20 +7738,28 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; // Check for gathered extracts. 
- ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); + ExtractShuffle = + tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } + } else if (ExtractShuffle && + TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { + copy(VL, GatheredScalars.begin()); + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7652,12 +7769,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + if (!GatherShuffles.empty()) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. 
LLVM_DEBUG( @@ -7671,15 +7788,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, continue; } if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = Entries.front().front()->findLaneForValue(V); } - Estimator.add(Entries.front(), Mask); + Estimator.add(*Entries.front().front(), Mask); return Estimator.finalize(E->ReuseShuffleIndices); } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -7691,7 +7811,21 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle " << shortBundleName(VL) << ".\n"); - Estimator.add(Entries.front(), Entries.back(), Mask); + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + Estimator.add(*TEs.front(), *TEs.back(), VecMask); + } if (all_of(GatheredScalars, PoisonValue ::classof)) return Estimator.finalize(E->ReuseShuffleIndices); return Estimator.finalize( @@ -7705,16 +7839,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (!all_of(GatheredScalars, PoisonValue::classof)) { auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); + if (!SameGathers) + return Estimator.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl &Mask) { + Vec = Estimator.gather( + GatheredScalars, Constant::getNullValue(FixedVectorType::get( + ScalarTy, GatheredScalars.size()))); + }); + Value *BV = Estimator.gather(Gathers); SmallVector ReuseMask(Gathers.size(), PoisonMaskElem); std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; @@ -9037,16 +9174,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } std::optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries) { +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, + SmallVectorImpl &Entries, unsigned Part) { Entries.clear(); - // No need to check for the topmost gather node. 
- if (TE == VectorizableTree.front().get()) - return std::nullopt; - Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); @@ -9121,7 +9252,7 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() : &getLastInstructionInBundle(UseEI.UserTE); if (TEInsertPt == InsertPt) { - // If 2 gathers are operands of the same entry (regardless of wether + // If 2 gathers are operands of the same entry (regardless of whether // user is PHI or else), compare operands indices, use the earlier one // as the base. if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) @@ -9186,8 +9317,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -9203,7 +9336,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, }); if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); + std::iota(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), 0); // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa(VL[I])) @@ -9340,7 +9474,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -9353,9 +9490,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9370,9 +9508,63 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, break; } Entries.clear(); + // Clear the corresponding mask elements. + std::fill(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } +SmallVector> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. 
+ if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl &SubEntries = Entries.emplace_back(); + std::optional SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && + SubRes.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, SubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9839,9 +10031,13 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + Value *needToDelay(const TreeEntry *E, + ArrayRef> Deps) { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + if (all_of(Deps, [](ArrayRef TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) return nullptr; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. @@ -10176,9 +10372,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. ExtractShuffle = @@ -10197,9 +10397,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { + if (!GatherShuffles.empty()) { if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); @@ -10207,10 +10408,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // process to keep correct order. 
return Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -10218,11 +10418,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { << "SLP: perfect diamond match for gather bundle " << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. - if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -10230,17 +10430,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(FrontTE->VectorizedValue, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -10340,9 +10543,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } }; - if (ExtractShuffle || GatherShuffle) { + if (ExtractShuffle || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -10367,36 +10570,53 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat( + IsUsedInExpr &= FindReusedSplat( ExtractMask, cast(Vec1->getType())->getNumElements()); ShuffleBuilder.add(Vec1, ExtractMask); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat( - Mask, - cast(Entries.front()->VectorizedValue->getType()) - ->getNumElements()); - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); - } else { - 
ShuffleBuilder.add(Entries.front()->VectorizedValue, - Entries.back()->VectorizedValue, Mask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && - isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); + if (!GatherShuffles.empty()) { + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + if (TEs.size() == 1) { + IsUsedInExpr &= FindReusedSplat( + VecMask, + cast(TEs.front()->VectorizedValue->getType()) + ->getNumElements()); + ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); + } else { + IsUsedInExpr = false; + ShuffleBuilder.add(TEs.front()->VectorizedValue, + TEs.back()->VectorizedValue, VecMask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && + isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); + } } } // Try to figure out best way to combine values: build a shuffle and insert @@ -10407,14 +10627,18 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; + bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty(); bool IsIdentityShuffle = (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || - (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc && + (!GatherShuffles.empty() && + all_of(GatherShuffles, + [](const std::optional &SK) { + return SK.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc; + }) && none_of(Mask, [&](int I) { return I >= MSz; }) && ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = @@ -10590,7 +10814,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { continue; } - Builder.SetInsertPoint(IBB->getTerminator()); + // if (any_of(E->getOperand(i), [&](Value *V) { + // auto *I = dyn_cast(V); + // return I && I->getParent() == IBB; + // })) + Builder.SetInsertPoint(IBB->getTerminator()); + // else + // Builder.SetInsertPoint(IBB->getFirstNonPHIOrDbgOrLifetime()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); Value *Vec = vectorizeOperand(E, i, /*PostponedPHIs=*/true); NewPhi->addIncoming(Vec, IBB); @@ -11254,10 +11484,22 @@ Value *BoUpSLP::vectorizeTree( // The is because source vector that supposed to feed this gather node was // inserted at the end of the block [after stab instruction]. So we need // to adjust insertion point again to the end of block. - if (isa(UserI)) - Builder.SetInsertPoint(PrevVec->getParent()->getTerminator()); - else + if (isa(UserI)) { + // Insert before all users. 
+ Instruction *InsertPt = PrevVec->getParent()->getTerminator(); + for (User *U : PrevVec->users()) { + if (U == UserI) + continue; + auto *UI = dyn_cast(U); + if (!UI || isa(UI) || UI->getParent() != InsertPt->getParent()) + continue; + if (UI->comesBefore(InsertPt)) + InsertPt = UI; + } + Builder.SetInsertPoint(InsertPt); + } else { Builder.SetInsertPoint(PrevVec); + } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll index 21aac98aa3ece..e5b5a5c6c4a00 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( @@ -14,18 +14,43 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; +; AVX2-LABEL: @test( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = 
insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 +; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2 +; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 +; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] +; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] +; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] +; AVX2-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; AVX2-NEXT: br label [[BB:%.*]] +; AVX2: bb: +; AVX2-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] +; AVX2-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; AVX2-NEXT: br label [[BB]] +; entry: %a0 = add i64 %p0, %p0 %a1 = add i64 %p1, %p1 From bc792a284362696c91599f9ab01f74eda4b9108f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 26 Oct 2023 09:05:16 -0400 Subject: [PATCH 066/877] [libc++] Encode additional ODR-affecting properties in the ABI tag (#69669) As explained in `__config`, we have an ABI tag that we use to ensure that we don't run into ODR issues when mixing different versions of libc++ in multiple TUs. However, the reasoning behind that extends not only to different versions of libc++, but also to different configurations of the same version of libc++. In fact, we've been aware of this for a while but never really bothered to make the change because ODR issues are often thought to be benign. Well, it turns out that I just spent over an hour banging my head against an issue that boils down to our lack of encoding of some ODR properties in the ABI tag, so here's the patch we should have done a long time ago. For now, the ODR properties we encode in the ABI tag are: - library version - exceptions vs no-exceptions - hardening mode Those are all things that we support different values for on a per-TU basis and they definitely affect ODR in a meaningful way. We can add more properties later as we see fit. --- libcxx/include/__config | 58 +++++++++++---- .../libcxx/odr_signature.exceptions.sh.cpp | 46 ++++++++++++ .../libcxx/odr_signature.hardening.sh.cpp | 72 +++++++++++++++++++ 3 files changed, 161 insertions(+), 15 deletions(-) create mode 100644 libcxx/test/libcxx/odr_signature.exceptions.sh.cpp create mode 100644 libcxx/test/libcxx/odr_signature.hardening.sh.cpp diff --git a/libcxx/include/__config b/libcxx/include/__config index 65ce6d6a27f83..4bf171f998c6f 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -56,10 +56,6 @@ # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) -// Valid C++ identifier that revs with every libc++ version. This can be used to -// generate identifiers that must be unique for every released libc++ version. 
-# define _LIBCPP_VERSIONED_IDENTIFIER _LIBCPP_CONCAT(v, _LIBCPP_VERSION) - # if __STDC_HOSTED__ == 0 # define _LIBCPP_FREESTANDING # endif @@ -734,22 +730,54 @@ typedef __char32_t char32_t; # define _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION _LIBCPP_ALWAYS_INLINE # endif +# if _LIBCPP_ENABLE_HARDENED_MODE +# define _LIBCPP_HARDENING_SIG h +# elif _LIBCPP_ENABLE_SAFE_MODE +# define _LIBCPP_HARDENING_SIG s +# elif _LIBCPP_ENABLE_DEBUG_MODE +# define _LIBCPP_HARDENING_SIG d +# else +# define _LIBCPP_HARDENING_SIG u // for unchecked +# endif + +# ifdef _LIBCPP_HAS_NO_EXCEPTIONS +# define _LIBCPP_EXCEPTIONS_SIG n +# else +# define _LIBCPP_EXCEPTIONS_SIG e +# endif + +# define _LIBCPP_ODR_SIGNATURE \ + _LIBCPP_CONCAT(_LIBCPP_CONCAT(_LIBCPP_HARDENING_SIG, _LIBCPP_EXCEPTIONS_SIG), _LIBCPP_VERSION) + // This macro marks a symbol as being hidden from libc++'s ABI. This is achieved // on two levels: // 1. The symbol is given hidden visibility, which ensures that users won't start exporting // symbols from their dynamic library by means of using the libc++ headers. This ensures // that those symbols stay private to the dynamic library in which it is defined. // -// 2. The symbol is given an ABI tag that changes with each version of libc++. This ensures -// that no ODR violation can arise from mixing two TUs compiled with different versions -// of libc++ where we would have changed the definition of a symbol. If the symbols shared -// the same name, the ODR would require that their definitions be token-by-token equivalent, -// which basically prevents us from being able to make any change to any function in our -// headers. Using this ABI tag ensures that the symbol name is "bumped" artificially at -// each release, which lets us change the definition of these symbols at our leisure. -// Note that historically, this has been achieved in various ways, including force-inlining -// all functions or giving internal linkage to all functions. Both these (previous) solutions -// suffer from drawbacks that lead notably to code bloat. +// 2. The symbol is given an ABI tag that encodes the ODR-relevant properties of the library. +// This ensures that no ODR violation can arise from mixing two TUs compiled with different +// versions or configurations of libc++ (such as exceptions vs no-exceptions). Indeed, if the +// program contains two definitions of a function, the ODR requires them to be token-by-token +// equivalent, and the linker is allowed to pick either definition and discard the other one. +// +// For example, if a program contains a copy of `vector::at()` compiled with exceptions enabled +// *and* a copy of `vector::at()` compiled with exceptions disabled (by means of having two TUs +// compiled with different settings), the two definitions are both visible by the linker and they +// have the same name, but they have a meaningfully different implementation (one throws an exception +// and the other aborts the program). This violates the ODR and makes the program ill-formed, and in +// practice what will happen is that the linker will pick one of the definitions at random and will +// discard the other one. This can quite clearly lead to incorrect program behavior. +// +// A similar reasoning holds for many other properties that are ODR-affecting. Essentially any +// property that causes the code of a function to differ from the code in another configuration +// can be considered ODR-affecting. 
In practice, we don't encode all such properties in the ABI +// tag, but we encode the ones that we think are most important: library version, exceptions, and +// hardening mode. +// +// Note that historically, solving this problem has been achieved in various ways, including +// force-inlining all functions or giving internal linkage to all functions. Both these previous +// solutions suffer from drawbacks that lead notably to code bloat. // // Note that we use _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION to ensure that we don't depend // on _LIBCPP_HIDE_FROM_ABI methods of classes explicitly instantiated in the dynamic library. @@ -769,7 +797,7 @@ typedef __char32_t char32_t; # ifndef _LIBCPP_NO_ABI_TAG # define _LIBCPP_HIDE_FROM_ABI \ _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION \ - __attribute__((__abi_tag__(_LIBCPP_TOSTRING(_LIBCPP_VERSIONED_IDENTIFIER)))) + __attribute__((__abi_tag__(_LIBCPP_TOSTRING(_LIBCPP_ODR_SIGNATURE)))) # else # define _LIBCPP_HIDE_FROM_ABI _LIBCPP_HIDDEN _LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION # endif diff --git a/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp b/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp new file mode 100644 index 0000000000000..6bf60b5e82d3c --- /dev/null +++ b/libcxx/test/libcxx/odr_signature.exceptions.sh.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// TODO: Investigate +// XFAIL: msvc + +// Test that we encode whether exceptions are supported in an ABI tag to avoid +// ODR violations when linking TUs that have different values for it. + +// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU1 -fno-exceptions -o %t.tu1.o +// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU2 -fexceptions -o %t.tu2.o +// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DMAIN -o %t.main.o +// RUN: %{cxx} %t.tu1.o %t.tu2.o %t.main.o %{flags} %{link_flags} -o %t.exe +// RUN: %{exec} %t.exe + +// -fno-exceptions +#ifdef TU1 +# include <__config> +_LIBCPP_HIDE_FROM_ABI inline int f() { return 1; } +int tu1() { return f(); } +#endif // TU1 + +// -fexceptions +#ifdef TU2 +# include <__config> +_LIBCPP_HIDE_FROM_ABI inline int f() { return 2; } +int tu2() { return f(); } +#endif // TU2 + +#ifdef MAIN +# include + +int tu1(); +int tu2(); + +int main(int, char**) { + assert(tu1() == 1); + assert(tu2() == 2); + return 0; +} +#endif // MAIN diff --git a/libcxx/test/libcxx/odr_signature.hardening.sh.cpp b/libcxx/test/libcxx/odr_signature.hardening.sh.cpp new file mode 100644 index 0000000000000..3ae95c8910a92 --- /dev/null +++ b/libcxx/test/libcxx/odr_signature.hardening.sh.cpp @@ -0,0 +1,72 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// TODO: Remove these UNSUPPORTED lines once we change how hardening is enabled to avoid +// mutually exclusive modes being enabled at the same time. 
+// UNSUPPORTED: libcpp-hardening-mode=hardened
+// UNSUPPORTED: libcpp-hardening-mode=safe
+// UNSUPPORTED: libcpp-hardening-mode=debug
+
+// TODO: Investigate
+// XFAIL: msvc
+
+// Test that we encode the hardening mode in an ABI tag to avoid ODR violations
+// when linking TUs that have different values for it.
+
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU1 -D_LIBCPP_ENABLE_HARDENED_MODE -o %t.tu1.o
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU2 -D_LIBCPP_ENABLE_SAFE_MODE -o %t.tu2.o
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU3 -D_LIBCPP_ENABLE_DEBUG_MODE -o %t.tu3.o
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DTU4 -o %t.tu4.o
+// RUN: %{cxx} %s %{flags} %{compile_flags} -c -DMAIN -o %t.main.o
+// RUN: %{cxx} %t.tu1.o %t.tu2.o %t.tu3.o %t.tu4.o %t.main.o %{flags} %{link_flags} -o %t.exe
+// RUN: %{exec} %t.exe
+
+// hardened mode
+#ifdef TU1
+# include <__config>
+_LIBCPP_HIDE_FROM_ABI inline int f() { return 1; }
+int tu1() { return f(); }
+#endif // TU1
+
+// safe mode
+#ifdef TU2
+# include <__config>
+_LIBCPP_HIDE_FROM_ABI inline int f() { return 2; }
+int tu2() { return f(); }
+#endif // TU2
+
+// debug mode
+#ifdef TU3
+# include <__config>
+_LIBCPP_HIDE_FROM_ABI inline int f() { return 3; }
+int tu3() { return f(); }
+#endif // TU3
+
+// unchecked mode
+#ifdef TU4
+# include <__config>
+_LIBCPP_HIDE_FROM_ABI inline int f() { return 4; }
+int tu4() { return f(); }
+#endif // TU4
+
+#ifdef MAIN
+# include <cassert>
+
+int tu1();
+int tu2();
+int tu3();
+int tu4();
+
+int main(int, char**) {
+  assert(tu1() == 1);
+  assert(tu2() == 2);
+  assert(tu3() == 3);
+  assert(tu4() == 4);
+  return 0;
+}
+#endif // MAIN

From 6282b745e09d57a29e8221db638a7d393d66608d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Thu, 26 Oct 2023 15:09:42 +0200
Subject: [PATCH 067/877] [clang][Interp][NFC] Fix a doc comment

---
 clang/lib/AST/Interp/Descriptor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h
index 2fd4e92082645..6ee9bbe85d71d 100644
--- a/clang/lib/AST/Interp/Descriptor.h
+++ b/clang/lib/AST/Interp/Descriptor.h
@@ -84,7 +84,7 @@ struct Descriptor final {
   const unsigned ElemSize;
   /// Size of the storage, in host bytes.
   const unsigned Size;
-  // Size of the metadata.
+  /// Size of the metadata.
   const unsigned MDSize;
   /// Size of the allocation (storage + metadata), in host bytes.
   const unsigned AllocSize;

From 1b6b4d6a08321fb914127dadcd6677dcd9b1b222 Mon Sep 17 00:00:00 2001
From: Qizhi Hu <836744285@qq.com>
Date: Thu, 26 Oct 2023 21:11:51 +0800
Subject: [PATCH 068/877] [analyzer] Loop should contain CXXForRangeStmt (#70190)

With loop widening enabled, the static analyzer fails to report
diagnostics for statements that follow a CXXForRangeStmt, because
`ExprEngine::processCFGBlockEntrance` does not handle CXXForRangeStmt:
once `BlockCount` reaches `AMgr.options.maxBlockVisitOnPath - 1`, the
loop cannot be widened, so on the next iteration
`BlockCount >= AMgr.options.maxBlockVisitOnPath` holds and a sink node
is generated. Handling `CXXForRangeStmt` in the widening checks fixes
this.
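In essence, the previously missed pattern looks like this (reduced from
the regression test added below; for illustration only):

  int test_for_range_loop() {
    int arr[10] = {0};
    for (auto x : arr) // the loop is now widened, invalidating `arr`
      ++x;
    if (arr[0] == 0)   // assumed true after widening
      return 1 / arr[0]; // now diagnosed: Division by zero
    return 0;
  }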
Co-authored-by: huqizhi <836744285@qq.com>
---
 clang/lib/StaticAnalyzer/Core/ExprEngine.cpp   |  2 +-
 clang/lib/StaticAnalyzer/Core/LoopWidening.cpp |  4 +++-
 clang/test/Analysis/loop-widening-notes.cpp    | 12 ++++++++++++
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
index 451ee91b94533..2e67fb953e456 100644
--- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
+++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp
@@ -2509,7 +2509,7 @@ void ExprEngine::processCFGBlockEntrance(const BlockEdge &L,
   if (BlockCount == AMgr.options.maxBlockVisitOnPath - 1 &&
       AMgr.options.ShouldWidenLoops) {
     const Stmt *Term = nodeBuilder.getContext().getBlock()->getTerminatorStmt();
-    if (!isa_and_nonnull<ForStmt, WhileStmt, DoStmt>(Term))
+    if (!isa_and_nonnull<ForStmt, WhileStmt, DoStmt, CXXForRangeStmt>(Term))
       return;
     // Widen.
     const LocationContext *LCtx = Pred->getLocationContext();
diff --git a/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp b/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
index a3b29ff487e4e..9e42801760622 100644
--- a/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
+++ b/clang/lib/StaticAnalyzer/Core/LoopWidening.cpp
@@ -35,6 +35,8 @@ static const Expr *getLoopCondition(const Stmt *LoopStmt) {
     return cast<WhileStmt>(LoopStmt)->getCond();
   case Stmt::DoStmtClass:
     return cast<DoStmt>(LoopStmt)->getCond();
+  case Stmt::CXXForRangeStmtClass:
+    return cast<CXXForRangeStmt>(LoopStmt)->getCond();
   }
 }
@@ -45,7 +47,7 @@ ProgramStateRef getWidenedLoopState(ProgramStateRef PrevState,
                                     const LocationContext *LCtx,
                                     unsigned BlockCount, const Stmt *LoopStmt) {
-  assert((isa<ForStmt, WhileStmt, DoStmt>(LoopStmt)));
+  assert((isa<ForStmt, WhileStmt, DoStmt, CXXForRangeStmt>(LoopStmt)));
 
   // Invalidate values in the current state.
   // TODO Make this more conservative by only invalidating values that might
diff --git a/clang/test/Analysis/loop-widening-notes.cpp b/clang/test/Analysis/loop-widening-notes.cpp
index 0ba71d030d058..a3f030dfe9882 100644
--- a/clang/test/Analysis/loop-widening-notes.cpp
+++ b/clang/test/Analysis/loop-widening-notes.cpp
@@ -70,3 +70,15 @@ int test_for_loop() {
   return flag_d / num; // no-crash expected-warning {{Division by zero}}
                        // expected-note@-1 {{Division by zero}}
 }
+
+int test_for_range_loop() {
+  int arr[10] = {0};
+  for(auto x : arr) { // expected-note {{Assigning value}}
+    ++x;
+  }
+  if (arr[0] == 0) // expected-note {{Assuming the condition is true}}
+                   // expected-note@-1 {{Taking true branch}}
+    return 1/arr[0]; // expected-warning {{Division by zero}}
+                     // expected-note@-1 {{Division by zero}}
+  return 0;
+}

From 7f677fe3100131214386f9ce1fa308c235a595e9 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Thu, 26 Oct 2023 15:15:25 +0200
Subject: [PATCH 069/877] [clang][Interp] Add explicit dummy descriptors (#68888)

Instead of (ab)using incomplete array types for this, add a 'Dummy' bit
to Descriptor. We need to be able to differentiate between the two when
adding an offset.
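As a rough illustration (a hypothetical C++ reduction; the tests added
below exercise the C equivalents via _Static_assert), the dummy path is
hit for declarations the interpreter has no storage for:

  // Compile with -fexperimental-new-constant-interpreter.
  extern int a; // declared but never defined: modeled by a dummy block
  static_assert(&a != nullptr, "");     // folds although `a` has no storage
  static_assert(&a + 1 != nullptr, ""); // offsetting the dummy pointer must not assert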
--- clang/lib/AST/Interp/Descriptor.cpp | 7 +++++++ clang/lib/AST/Interp/Descriptor.h | 6 ++++++ clang/lib/AST/Interp/Interp.cpp | 6 ++++++ clang/lib/AST/Interp/Interp.h | 20 ++++++++++++++------ clang/lib/AST/Interp/InterpBuiltin.cpp | 3 +++ clang/lib/AST/Interp/Pointer.h | 2 ++ clang/lib/AST/Interp/Program.cpp | 20 +++++++++++--------- clang/test/AST/Interp/c.c | 20 ++++++++++++++++++++ 8 files changed, 69 insertions(+), 15 deletions(-) diff --git a/clang/lib/AST/Interp/Descriptor.cpp b/clang/lib/AST/Interp/Descriptor.cpp index 2a21f60588d46..59a952135a2d8 100644 --- a/clang/lib/AST/Interp/Descriptor.cpp +++ b/clang/lib/AST/Interp/Descriptor.cpp @@ -296,6 +296,13 @@ Descriptor::Descriptor(const DeclTy &D, Record *R, MetadataSize MD, assert(Source && "Missing source"); } +Descriptor::Descriptor(const DeclTy &D, MetadataSize MD) + : Source(D), ElemSize(1), Size(ElemSize), MDSize(MD.value_or(0)), + AllocSize(Size + MDSize), ElemRecord(nullptr), IsConst(true), + IsMutable(false), IsTemporary(false), IsDummy(true) { + assert(Source && "Missing source"); +} + QualType Descriptor::getType() const { if (auto *E = asExpr()) return E->getType(); diff --git a/clang/lib/AST/Interp/Descriptor.h b/clang/lib/AST/Interp/Descriptor.h index 6ee9bbe85d71d..8135f3d12f703 100644 --- a/clang/lib/AST/Interp/Descriptor.h +++ b/clang/lib/AST/Interp/Descriptor.h @@ -111,6 +111,8 @@ struct Descriptor final { const bool IsTemporary = false; /// Flag indicating if the block is an array. const bool IsArray = false; + /// Flag indicating if this is a dummy descriptor. + const bool IsDummy = false; /// Storage management methods. const BlockCtorFn CtorFn = nullptr; @@ -139,6 +141,8 @@ struct Descriptor final { Descriptor(const DeclTy &D, Record *R, MetadataSize MD, bool IsConst, bool IsTemporary, bool IsMutable); + Descriptor(const DeclTy &D, MetadataSize MD); + QualType getType() const; QualType getElemQualType() const; SourceLocation getLocation() const; @@ -192,6 +196,8 @@ struct Descriptor final { bool isArray() const { return IsArray; } /// Checks if the descriptor is of a record. bool isRecord() const { return !IsArray && ElemRecord; } + /// Checks if this is a dummy descriptor. + bool isDummy() const { return IsDummy; } }; /// Bitfield tracking the initialisation status of elements of primitive arrays. diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index 4a4c0922758c9..31d43b6010c18 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -215,6 +215,10 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, return true; } +bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { + return !Ptr.isDummy(); +} + bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK) { if (!Ptr.isZero()) @@ -297,6 +301,8 @@ bool CheckInitialized(InterpState &S, CodePtr OpPC, const Pointer &Ptr, } bool CheckLoad(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { + if (!CheckDummy(S, OpPC, Ptr)) + return false; if (!CheckLive(S, OpPC, Ptr, AK_Read)) return false; if (!CheckExtern(S, OpPC, Ptr)) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 86cc267652951..3e1b4f32e8b69 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -55,6 +55,10 @@ bool CheckArray(InterpState &S, CodePtr OpPC, const Pointer &Ptr); /// Checks if a pointer is live and accessible. 
bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr, AccessKinds AK); + +/// Checks if a pointer is a dummy pointer. +bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr); + /// Checks if a pointer is null. bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr, CheckSubobjectKind CSK); @@ -1415,8 +1419,9 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset, // Compute the largest index into the array. T MaxIndex = T::from(Ptr.getNumElems(), Offset.bitWidth()); + bool Invalid = false; // Helper to report an invalid offset, computed as APSInt. - auto InvalidOffset = [&]() { + auto DiagInvalidOffset = [&]() -> void { const unsigned Bits = Offset.bitWidth(); APSInt APOffset(Offset.toAPSInt().extend(Bits + 2), false); APSInt APIndex(Index.toAPSInt().extend(Bits + 2), false); @@ -1426,28 +1431,31 @@ bool OffsetHelper(InterpState &S, CodePtr OpPC, const T &Offset, << NewIndex << /*array*/ static_cast(!Ptr.inArray()) << static_cast(MaxIndex); - return false; + Invalid = true; }; T MaxOffset = T::from(MaxIndex - Index, Offset.bitWidth()); if constexpr (Op == ArithOp::Add) { // If the new offset would be negative, bail out. if (Offset.isNegative() && (Offset.isMin() || -Offset > Index)) - return InvalidOffset(); + DiagInvalidOffset(); // If the new offset would be out of bounds, bail out. if (Offset.isPositive() && Offset > MaxOffset) - return InvalidOffset(); + DiagInvalidOffset(); } else { // If the new offset would be negative, bail out. if (Offset.isPositive() && Index < Offset) - return InvalidOffset(); + DiagInvalidOffset(); // If the new offset would be out of bounds, bail out. if (Offset.isNegative() && (Offset.isMin() || -Offset > MaxOffset)) - return InvalidOffset(); + DiagInvalidOffset(); } + if (Invalid && !Ptr.isDummy()) + return false; + // Offset is valid - compute it on unsigned. int64_t WideIndex = static_cast(Index); int64_t WideOffset = static_cast(Offset); diff --git a/clang/lib/AST/Interp/InterpBuiltin.cpp b/clang/lib/AST/Interp/InterpBuiltin.cpp index 7552c1b88cff6..e329794cb7924 100644 --- a/clang/lib/AST/Interp/InterpBuiltin.cpp +++ b/clang/lib/AST/Interp/InterpBuiltin.cpp @@ -152,6 +152,9 @@ static bool interp__builtin_strlen(InterpState &S, CodePtr OpPC, if (!CheckLive(S, OpPC, StrPtr, AK_Read)) return false; + if (!CheckDummy(S, OpPC, StrPtr)) + return false; + assert(StrPtr.getFieldDesc()->isPrimitiveArray()); size_t Len = 0; diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index 65d710077fd1c..b371b306fe7a7 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -314,6 +314,8 @@ class Pointer { bool isActive() const { return Base == 0 || getInlineDesc()->IsActive; } /// Checks if a structure is a base class. bool isBaseClass() const { return isField() && getInlineDesc()->IsBase; } + /// Checks if the pointer pointers to a dummy value. + bool isDummy() const { return getDeclDesc()->isDummy(); } /// Checks if an object or a subfield is mutable. bool isConst() const { diff --git a/clang/lib/AST/Interp/Program.cpp b/clang/lib/AST/Interp/Program.cpp index 65e170881e313..c6d19afd7d221 100644 --- a/clang/lib/AST/Interp/Program.cpp +++ b/clang/lib/AST/Interp/Program.cpp @@ -144,16 +144,18 @@ std::optional Program::getOrCreateDummy(const ValueDecl *PD) { It != DummyParams.end()) return It->second; - // Create a pointer to an incomplete array of the specified elements. 
- QualType ElemTy = PD->getType(); - QualType Ty = - Ctx.getASTContext().getIncompleteArrayType(ElemTy, ArrayType::Normal, 0); + // Create dummy descriptor. + Descriptor *Desc = allocateDescriptor(PD, std::nullopt); + // Allocate a block for storage. + unsigned I = Globals.size(); - if (auto Idx = createGlobal(PD, Ty, /*isStatic=*/true, /*isExtern=*/true)) { - DummyParams[PD] = *Idx; - return Idx; - } - return std::nullopt; + auto *G = new (Allocator, Desc->getAllocSize()) + Global(getCurrentDecl(), Desc, /*IsStatic=*/true, /*IsExtern=*/false); + G->block()->invokeCtor(); + + Globals.push_back(G); + DummyParams[PD] = I; + return I; } std::optional Program::createGlobal(const ValueDecl *VD, diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c index 974ca72702f7d..e8aa8b8599f21 100644 --- a/clang/test/AST/Interp/c.c +++ b/clang/test/AST/Interp/c.c @@ -47,3 +47,23 @@ _Static_assert(&a != 0, ""); // ref-warning {{always true}} \ // expected-warning {{always true}} \ // pedantic-expected-warning {{always true}} \ // pedantic-expected-warning {{is a GNU extension}} +_Static_assert((&c + 1) != 0, ""); // pedantic-ref-warning {{is a GNU extension}} \ + // pedantic-expected-warning {{is a GNU extension}} +_Static_assert((&a + 100) != 0, ""); // pedantic-ref-warning {{is a GNU extension}} \ + // pedantic-ref-note {{100 of non-array}} \ + // pedantic-expected-note {{100 of non-array}} \ + // pedantic-expected-warning {{is a GNU extension}} +_Static_assert((&a - 100) != 0, ""); // pedantic-ref-warning {{is a GNU extension}} \ + // pedantic-expected-warning {{is a GNU extension}} \ + // pedantic-ref-note {{-100 of non-array}} \ + // pedantic-expected-note {{-100 of non-array}} +/// extern variable of a composite type. +/// FIXME: The 'cast from void*' note is missing in the new interpreter. 
+extern struct Test50S Test50; +_Static_assert(&Test50 != (void*)0, ""); // ref-warning {{always true}} \ + // pedantic-ref-warning {{always true}} \ + // pedantic-ref-warning {{is a GNU extension}} \ + // pedantic-ref-note {{cast from 'void *' is not allowed}} \ + // expected-warning {{always true}} \ + // pedantic-expected-warning {{always true}} \ + // pedantic-expected-warning {{is a GNU extension}} From 585da2651ff5d3a2645aa54813fb2b7928d88f55 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 26 Oct 2023 14:39:37 +0100 Subject: [PATCH 070/877] [SLP][X86] Regenerate hadd/hsub tests with full set of check-prefixes Prep for D148855 --- .../SLPVectorizer/X86/hadd-inseltpoison.ll | 36 ++++++++++--------- .../test/Transforms/SLPVectorizer/X86/hadd.ll | 36 ++++++++++--------- .../SLPVectorizer/X86/hsub-inseltpoison.ll | 19 ++++++---- .../test/Transforms/SLPVectorizer/X86/hsub.ll | 19 ++++++---- 4 files changed, 64 insertions(+), 46 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll index 5223cac571489..0217ddcac0046 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; ; 128-bit vectors @@ -215,17 +215,17 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b ; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; -; AVX-LABEL: @test_v4f64_partial_swizzle( -; AVX-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX-NEXT: 
[[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[R031]] +; AVX1-LABEL: @test_v4f64_partial_swizzle( +; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 +; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> poison, double [[R0]], i64 0 +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> +; AVX1-NEXT: ret <4 x double> [[R031]] ; ; AVX2-LABEL: @test_v4f64_partial_swizzle( ; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 @@ -448,3 +448,5 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll index 45514a7a7150d..c38d116a7a323 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hadd.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s 
-mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; ; 128-bit vectors @@ -215,17 +215,17 @@ define <4 x double> @test_v4f64_partial_swizzle(<4 x double> %a, <4 x double> %b ; SLM-NEXT: [[R03:%.*]] = insertelement <4 x double> [[TMP4]], double [[R3]], i64 3 ; SLM-NEXT: ret <4 x double> [[R03]] ; -; AVX-LABEL: @test_v4f64_partial_swizzle( -; AVX-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 -; AVX-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 -; AVX-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> -; AVX-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0 -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> -; AVX-NEXT: ret <4 x double> [[R031]] +; AVX1-LABEL: @test_v4f64_partial_swizzle( +; AVX1-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 +; AVX1-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i64 1 +; AVX1-NEXT: [[R0:%.*]] = fadd double [[A0]], [[A1]] +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <2 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <2 x i32> +; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; AVX1-NEXT: [[R00:%.*]] = insertelement <4 x double> undef, double [[R0]], i64 0 +; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX1-NEXT: [[R031:%.*]] = shufflevector <4 x double> [[R00]], <4 x double> [[TMP4]], <4 x i32> +; AVX1-NEXT: ret <4 x double> [[R031]] ; ; AVX2-LABEL: @test_v4f64_partial_swizzle( ; AVX2-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i64 0 @@ -448,3 +448,5 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; AVX: {{.*}} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll index cb05e46b466f2..39400ba4ce1e8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub-inseltpoison.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s -; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s +; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 +; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512 ; ; 128-bit vectors @@ -353,3 +353,10 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) { %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15 ret <16 x i16> %rv15 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
+; AVX: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
+; AVX512: {{.*}}
+; SLM: {{.*}}
+; SSE: {{.*}}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
index 7f6f24ce8fda1..6b63de83c56be 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/hsub.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s
-; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64-unknown -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,SLM
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -passes=slp-vectorizer,instcombine -S | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
 ;
 ; 128-bit vectors

@@ -353,3 +353,10 @@ define <16 x i16> @test_v16i16(<16 x i16> %a, <16 x i16> %b) {
   %rv15 = insertelement <16 x i16> %rv14, i16 %r15, i32 15
   ret <16 x i16> %rv15
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX: {{.*}}
+; AVX1: {{.*}}
+; AVX2: {{.*}}
+; AVX512: {{.*}}
+; SLM: {{.*}}
+; SSE: {{.*}}

From 2633d94f289b3d5aa86a1b00591a3c755ce693fa Mon Sep 17 00:00:00 2001
From: Gil Rapaport
Date: Thu, 26 Oct 2023 16:40:18 +0300
Subject: [PATCH 071/877] [mlir][emitc] Add a structured for operation (#68206)

Add an emitc.for op to the EmitC dialect as a lowering target for
scf.for, replacing its current direct translation to C. The translator
now handles emitc.for instead.
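As a sketch of the C++ builder API this adds (the helper function name
here is illustrative, not part of the patch):

  // Builds: emitc.for %iv = %lb to %ub step %step { ... }
  static void buildCountedLoop(mlir::OpBuilder &b, mlir::Location loc,
                               mlir::Value lb, mlir::Value ub,
                               mlir::Value step) {
    b.create<mlir::emitc::ForOp>(
        loc, lb, ub, step,
        [](mlir::OpBuilder &nested, mlir::Location nestedLoc, mlir::Value iv) {
          // Build the loop body here using `iv`; the region must terminate
          // with emitc.yield.
          nested.create<mlir::emitc::YieldOp>(nestedLoc);
        });
  }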
--- mlir/docs/Dialects/emitc.md | 3 - mlir/include/mlir/Dialect/EmitC/IR/EmitC.td | 64 ++++++++- mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp | 123 ++++++++++++++---- mlir/lib/Dialect/EmitC/IR/EmitC.cpp | 95 ++++++++++++++ mlir/lib/Target/Cpp/TranslateToCpp.cpp | 85 +----------- mlir/test/Conversion/SCFToEmitC/for.mlir | 96 ++++++++++++++ mlir/test/Dialect/EmitC/invalid_ops.mlir | 2 +- mlir/test/Dialect/EmitC/ops.mlir | 24 +++- mlir/test/Target/Cpp/for.mlir | 56 ++++++-- 9 files changed, 426 insertions(+), 122 deletions(-) create mode 100644 mlir/test/Conversion/SCFToEmitC/for.mlir diff --git a/mlir/docs/Dialects/emitc.md b/mlir/docs/Dialects/emitc.md index 4d9f04ab11c8f..03b85611ee3cd 100644 --- a/mlir/docs/Dialects/emitc.md +++ b/mlir/docs/Dialects/emitc.md @@ -31,8 +31,5 @@ translating the following operations: * `func.constant` * `func.func` * `func.return` -* 'scf' Dialect - * `scf.for` - * `scf.yield` * 'arith' Dialect * `arith.constant` diff --git a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td index 827ffc0278fce..2edeb6f8a9cf0 100644 --- a/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td +++ b/mlir/include/mlir/Dialect/EmitC/IR/EmitC.td @@ -246,6 +246,67 @@ def EmitC_DivOp : EmitC_BinaryOp<"div", []> { let results = (outs FloatIntegerIndexOrOpaqueType); } +def EmitC_ForOp : EmitC_Op<"for", + [AllTypesMatch<["lowerBound", "upperBound", "step"]>, + SingleBlockImplicitTerminator<"emitc::YieldOp">, + RecursiveMemoryEffects]> { + let summary = "for operation"; + let description = [{ + The `emitc.for` operation represents a C loop of the following form: + + ```c++ + for (T i = lb; i < ub; i += step) { /* ... */ } // where T is typeof(lb) + ``` + + The operation takes 3 SSA values as operands that represent the lower bound, + upper bound and step respectively, and defines an SSA value for its + induction variable. It has one region capturing the loop body. The induction + variable is represented as an argument of this region. This SSA value is a + signless integer or index. The step is a value of same type. + + This operation has no result. The body region must contain exactly one block + that terminates with `emitc.yield`. Calling ForOp::build will create such a + region and insert the terminator implicitly if none is defined, so will the + parsing even in cases when it is absent from the custom format. For example: + + ```mlir + // Index case. + emitc.for %iv = %lb to %ub step %step { + ... // body + } + ... + // Integer case. + emitc.for %iv_32 = %lb_32 to %ub_32 step %step_32 : i32 { + ... 
// body
+    }
+    ```
+  }];
+  let arguments = (ins IntegerIndexOrOpaqueType:$lowerBound,
+                       IntegerIndexOrOpaqueType:$upperBound,
+                       IntegerIndexOrOpaqueType:$step);
+  let results = (outs);
+  let regions = (region SizedRegion<1>:$region);
+
+  let skipDefaultBuilders = 1;
+  let builders = [
+    OpBuilder<(ins "Value":$lowerBound, "Value":$upperBound, "Value":$step,
+      CArg<"function_ref<void(OpBuilder &, Location, Value)>", "nullptr">)>
+  ];
+
+  let extraClassDeclaration = [{
+    using BodyBuilderFn =
+        function_ref<void(OpBuilder &, Location, Value)>;
+    Value getInductionVar() { return getBody()->getArgument(0); }
+    void setLowerBound(Value bound) { getOperation()->setOperand(0, bound); }
+    void setUpperBound(Value bound) { getOperation()->setOperand(1, bound); }
+    void setStep(Value step) { getOperation()->setOperand(2, step); }
+  }];
+
+  let hasCanonicalizer = 1;
+  let hasCustomAssemblyFormat = 1;
+  let hasRegionVerifier = 1;
+}
+
 def EmitC_IncludeOp
     : EmitC_Op<"include", [HasParent<"ModuleOp">]> {
   let summary = "Include operation";
@@ -430,7 +491,8 @@ def EmitC_AssignOp : EmitC_Op<"assign", []> {
   let assemblyFormat = "$value `:` type($value) `to` $var `:` type($var) attr-dict";
 }

-def EmitC_YieldOp : EmitC_Op<"yield", [Pure, Terminator, ParentOneOf<["IfOp"]>]> {
+def EmitC_YieldOp : EmitC_Op<"yield",
+    [Pure, Terminator, ParentOneOf<["IfOp", "ForOp"]>]> {
   let summary = "block termination operation";
   let description = [{
     "yield" terminates blocks within EmitC control-flow operations. Since
diff --git a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
index 5d0d8df8869e3..bf69ba503f4e6 100644
--- a/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
+++ b/mlir/lib/Conversion/SCFToEmitC/SCFToEmitC.cpp
@@ -37,7 +37,100 @@ struct SCFToEmitCPass : public impl::SCFToEmitCBase<SCFToEmitCPass> {
   void runOnOperation() override;
 };

-// Lower scf::if to emitc::if, implementing return values as emitc::variable's
+// Lower scf::for to emitc::for, implementing result values using
+// emitc::variable's updated within the loop body.
+struct ForLowering : public OpRewritePattern<ForOp> {
+  using OpRewritePattern<ForOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(ForOp forOp,
+                                PatternRewriter &rewriter) const override;
+};
+
+// Create an uninitialized emitc::variable op for each result of the given op.
+template <typename T>
+static SmallVector<Value> createVariablesForResults(T op,
+                                                    PatternRewriter &rewriter) {
+  SmallVector<Value> resultVariables;
+
+  if (!op.getNumResults())
+    return resultVariables;
+
+  Location loc = op->getLoc();
+  MLIRContext *context = op.getContext();
+
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPoint(op);
+
+  for (OpResult result : op.getResults()) {
+    Type resultType = result.getType();
+    emitc::OpaqueAttr noInit = emitc::OpaqueAttr::get(context, "");
+    emitc::VariableOp var =
+        rewriter.create<emitc::VariableOp>(loc, resultType, noInit);
+    resultVariables.push_back(var);
+  }
+
+  return resultVariables;
+}
+
+// Create a series of assign ops assigning given values to given variables at
+// the current insertion point of given rewriter.
+static void assignValues(ValueRange values, SmallVector<Value> &variables,
+                         PatternRewriter &rewriter, Location loc) {
+  for (auto [value, var] : llvm::zip(values, variables))
+    rewriter.create<emitc::AssignOp>(loc, var, value);
+}
+
+static void lowerYield(SmallVector<Value> &resultVariables,
+                       PatternRewriter &rewriter, scf::YieldOp yield) {
+  Location loc = yield.getLoc();
+  ValueRange operands = yield.getOperands();
+
+  OpBuilder::InsertionGuard guard(rewriter);
+  rewriter.setInsertionPoint(yield);
+
+  assignValues(operands, resultVariables, rewriter, loc);
+
+  rewriter.create<emitc::YieldOp>(loc);
+  rewriter.eraseOp(yield);
+}
+
+LogicalResult ForLowering::matchAndRewrite(ForOp forOp,
+                                           PatternRewriter &rewriter) const {
+  Location loc = forOp.getLoc();
+
+  // Create an emitc::variable op for each result. These variables will be
+  // assigned to by emitc::assign ops within the loop body.
+  SmallVector<Value> resultVariables =
+      createVariablesForResults(forOp, rewriter);
+  SmallVector<Value> iterArgsVariables =
+      createVariablesForResults(forOp, rewriter);
+
+  assignValues(forOp.getInits(), iterArgsVariables, rewriter, loc);
+
+  emitc::ForOp loweredFor = rewriter.create<emitc::ForOp>(
+      loc, forOp.getLowerBound(), forOp.getUpperBound(), forOp.getStep());
+
+  Block *loweredBody = loweredFor.getBody();
+
+  // Erase the auto-generated terminator for the lowered for op.
+  rewriter.eraseOp(loweredBody->getTerminator());
+
+  SmallVector<Value> replacingValues;
+  replacingValues.push_back(loweredFor.getInductionVar());
+  replacingValues.append(iterArgsVariables.begin(), iterArgsVariables.end());
+
+  rewriter.mergeBlocks(forOp.getBody(), loweredBody, replacingValues);
+  lowerYield(iterArgsVariables, rewriter,
+             cast<scf::YieldOp>(loweredBody->getTerminator()));
+
+  // Copy iterArgs into results after the for loop.
+  assignValues(iterArgsVariables, resultVariables, rewriter, loc);
+
+  rewriter.replaceOp(forOp, resultVariables);
+  return success();
+}
+
+// Lower scf::if to emitc::if, implementing result values as emitc::variable's
 // updated within the then and else regions.
 struct IfLowering : public OpRewritePattern<IfOp> {
   using OpRewritePattern<IfOp>::OpRewritePattern;
@@ -52,20 +145,10 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp,
                                           PatternRewriter &rewriter) const {
   Location loc = ifOp.getLoc();
 
-  SmallVector<Value> resultVariables;
-
   // Create an emitc::variable op for each result. These variables will be
   // assigned to by emitc::assign ops within the then & else regions.
-  if (ifOp.getNumResults()) {
-    MLIRContext *context = ifOp.getContext();
-    rewriter.setInsertionPoint(ifOp);
-    for (OpResult result : ifOp.getResults()) {
-      Type resultType = result.getType();
-      auto noInit = emitc::OpaqueAttr::get(context, "");
-      auto var = rewriter.create<emitc::VariableOp>(loc, resultType, noInit);
-      resultVariables.push_back(var);
-    }
-  }
+  SmallVector<Value> resultVariables =
+      createVariablesForResults(ifOp, rewriter);
 
   // Utility function to lower the contents of an scf::if region to an emitc::if
  // region. The contents of the scf::if regions is moved into the respective
@@ -76,16 +159,7 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp,
                                 Region &loweredRegion) {
     rewriter.inlineRegionBefore(region, loweredRegion, loweredRegion.end());
     Operation *terminator = loweredRegion.back().getTerminator();
-    Location terminatorLoc = terminator->getLoc();
-    ValueRange terminatorOperands = terminator->getOperands();
-    rewriter.setInsertionPointToEnd(&loweredRegion.back());
-    for (auto value2Var : llvm::zip(terminatorOperands, resultVariables)) {
-      Value resultValue = std::get<0>(value2Var);
-      Value resultVar = std::get<1>(value2Var);
-      rewriter.create<emitc::AssignOp>(terminatorLoc, resultVar, resultValue);
-    }
-    rewriter.create<emitc::YieldOp>(terminatorLoc);
-    rewriter.eraseOp(terminator);
+    lowerYield(resultVariables, rewriter, cast<scf::YieldOp>(terminator));
   };
 
   Region &thenRegion = ifOp.getThenRegion();
@@ -109,6 +183,7 @@ LogicalResult IfLowering::matchAndRewrite(IfOp ifOp,
 }
 
 void mlir::populateSCFToEmitCConversionPatterns(RewritePatternSet &patterns) {
+  patterns.add<ForLowering>(patterns.getContext());
   patterns.add<IfLowering>(patterns.getContext());
 }
 
@@ -118,7 +193,7 @@ void SCFToEmitCPass::runOnOperation() {
 
   // Configure conversion to lower out SCF operations.
   ConversionTarget target(getContext());
-  target.addIllegalOp<scf::IfOp>();
+  target.addIllegalOp<scf::ForOp, scf::IfOp>();
   target.markUnknownOpDynamicallyLegal([](Operation *) { return true; });
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
diff --git a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
index 961a52a70a2a1..d06381b7ddad3 100644
--- a/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
+++ b/mlir/lib/Dialect/EmitC/IR/EmitC.cpp
@@ -189,6 +189,101 @@ LogicalResult emitc::ConstantOp::verify() {
 
 OpFoldResult emitc::ConstantOp::fold(FoldAdaptor adaptor) { return getValue(); }
 
+//===----------------------------------------------------------------------===//
+// ForOp
+//===----------------------------------------------------------------------===//
+
+void ForOp::build(OpBuilder &builder, OperationState &result, Value lb,
+                  Value ub, Value step, BodyBuilderFn bodyBuilder) {
+  result.addOperands({lb, ub, step});
+  Type t = lb.getType();
+  Region *bodyRegion = result.addRegion();
+  bodyRegion->push_back(new Block);
+  Block &bodyBlock = bodyRegion->front();
+  bodyBlock.addArgument(t, result.location);
+
+  // Create the default terminator if the builder is not provided.
+  if (!bodyBuilder) {
+    ForOp::ensureTerminator(*bodyRegion, builder, result.location);
+  } else {
+    OpBuilder::InsertionGuard guard(builder);
+    builder.setInsertionPointToStart(&bodyBlock);
+    bodyBuilder(builder, result.location, bodyBlock.getArgument(0));
+  }
+}
+
+void ForOp::getCanonicalizationPatterns(RewritePatternSet &, MLIRContext *) {}
+
+ParseResult ForOp::parse(OpAsmParser &parser, OperationState &result) {
+  Builder &builder = parser.getBuilder();
+  Type type;
+
+  OpAsmParser::Argument inductionVariable;
+  OpAsmParser::UnresolvedOperand lb, ub, step;
+
+  // Parse the induction variable followed by '='.
+  if (parser.parseOperand(inductionVariable.ssaName) || parser.parseEqual() ||
+      // Parse loop bounds.
+      parser.parseOperand(lb) || parser.parseKeyword("to") ||
+      parser.parseOperand(ub) || parser.parseKeyword("step") ||
+      parser.parseOperand(step))
+    return failure();
+
+  // Parse the optional initial iteration arguments.
+  SmallVector<OpAsmParser::Argument, 4> regionArgs;
+  SmallVector<OpAsmParser::UnresolvedOperand, 4> operands;
+  regionArgs.push_back(inductionVariable);
+
+  // Parse optional type, else assume Index.
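+  // As an illustrative sketch of the two accepted forms: with a colon,
+  // `emitc.for %i = %lb to %ub step %s : i32 {...}` types the induction
+  // variable explicitly, while `emitc.for %i = %lb to %ub step %s {...}`
+  // defaults it to `index`.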
+  if (parser.parseOptionalColon())
+    type = builder.getIndexType();
+  else if (parser.parseType(type))
+    return failure();
+
+  // Resolve input operands.
+  regionArgs.front().type = type;
+  if (parser.resolveOperand(lb, type, result.operands) ||
+      parser.resolveOperand(ub, type, result.operands) ||
+      parser.resolveOperand(step, type, result.operands))
+    return failure();
+
+  // Parse the body region.
+  Region *body = result.addRegion();
+  if (parser.parseRegion(*body, regionArgs))
+    return failure();
+
+  ForOp::ensureTerminator(*body, builder, result.location);
+
+  // Parse the optional attribute list.
+  if (parser.parseOptionalAttrDict(result.attributes))
+    return failure();
+
+  return success();
+}
+
+void ForOp::print(OpAsmPrinter &p) {
+  p << " " << getInductionVar() << " = " << getLowerBound() << " to "
+    << getUpperBound() << " step " << getStep();
+
+  p << ' ';
+  if (Type t = getInductionVar().getType(); !t.isIndex())
+    p << " : " << t << ' ';
+  p.printRegion(getRegion(),
+                /*printEntryBlockArgs=*/false,
+                /*printBlockTerminators=*/false);
+  p.printOptionalAttrDict((*this)->getAttrs());
+}
+
+LogicalResult ForOp::verifyRegions() {
+  // Check that the body defines a single block argument for the induction
+  // variable.
+  if (getInductionVar().getType() != getLowerBound().getType())
+    return emitOpError(
+        "expected induction variable to be same type as bounds and step");
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // IfOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Target/Cpp/TranslateToCpp.cpp b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
index 4645ca4b206e7..8ffea4d5b7b32 100644
--- a/mlir/lib/Target/Cpp/TranslateToCpp.cpp
+++ b/mlir/lib/Target/Cpp/TranslateToCpp.cpp
@@ -10,7 +10,6 @@
 #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
 #include "mlir/Dialect/EmitC/IR/EmitC.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Dialect.h"
@@ -502,30 +501,10 @@ static LogicalResult printOperation(CppEmitter &emitter,
   return success();
 }
 
-static LogicalResult printOperation(CppEmitter &emitter, scf::ForOp forOp) {
+static LogicalResult printOperation(CppEmitter &emitter, emitc::ForOp forOp) {
   raw_indented_ostream &os = emitter.ostream();
 
-  OperandRange operands = forOp.getInitArgs();
-  Block::BlockArgListType iterArgs = forOp.getRegionIterArgs();
-  Operation::result_range results = forOp.getResults();
-
-  if (!emitter.shouldDeclareVariablesAtTop()) {
-    for (OpResult result : results) {
-      if (failed(emitter.emitVariableDeclaration(result,
-                                                 /*trailingSemicolon=*/true)))
-        return failure();
-    }
-  }
-
-  for (auto pair : llvm::zip(iterArgs, operands)) {
-    if (failed(emitter.emitType(forOp.getLoc(), std::get<0>(pair).getType())))
-      return failure();
-    os << " " << emitter.getOrCreateName(std::get<0>(pair)) << " = ";
-    os << emitter.getOrCreateName(std::get<1>(pair)) << ";";
-    os << "\n";
-  }
-
   os << "for (";
   if (failed(
           emitter.emitType(forOp.getLoc(), forOp.getInductionVar().getType())))
@@ -548,35 +527,14 @@ static LogicalResult printOperation(CppEmitter &emitter, scf::ForOp forOp) {
 
   Region &forRegion = forOp.getRegion();
   auto regionOps = forRegion.getOps();
 
-  // We skip the trailing yield op because this updates the result variables
-  // of the for op in the generated code. Instead we update the iterArgs at
-  // the end of a loop iteration and set the result variables after the for
-  // loop.
+  // We skip the trailing yield op.
   for (auto it = regionOps.begin(); std::next(it) != regionOps.end(); ++it) {
     if (failed(emitter.emitOperation(*it, /*trailingSemicolon=*/true)))
       return failure();
   }
 
-  Operation *yieldOp = forRegion.getBlocks().front().getTerminator();
-  // Copy yield operands into iterArgs at the end of a loop iteration.
-  for (auto pair : llvm::zip(iterArgs, yieldOp->getOperands())) {
-    BlockArgument iterArg = std::get<0>(pair);
-    Value operand = std::get<1>(pair);
-    os << emitter.getOrCreateName(iterArg) << " = "
-       << emitter.getOrCreateName(operand) << ";\n";
-  }
-
   os.unindent() << "}";
 
-  // Copy iterArgs into results after the for loop.
-  for (auto pair : llvm::zip(results, iterArgs)) {
-    OpResult result = std::get<0>(pair);
-    BlockArgument iterArg = std::get<1>(pair);
-    os << "\n"
-       << emitter.getOrCreateName(result) << " = "
-       << emitter.getOrCreateName(iterArg) << ";";
-  }
-
   return success();
 }
 
@@ -617,33 +575,6 @@ static LogicalResult printOperation(CppEmitter &emitter, emitc::IfOp ifOp) {
   return success();
 }
 
-static LogicalResult printOperation(CppEmitter &emitter, scf::YieldOp yieldOp) {
-  raw_ostream &os = emitter.ostream();
-  Operation &parentOp = *yieldOp.getOperation()->getParentOp();
-
-  if (yieldOp.getNumOperands() != parentOp.getNumResults()) {
-    return yieldOp.emitError("number of operands does not to match the number "
-                             "of the parent op's results");
-  }
-
-  if (failed(interleaveWithError(
-          llvm::zip(parentOp.getResults(), yieldOp.getOperands()),
-          [&](auto pair) -> LogicalResult {
-            auto result = std::get<0>(pair);
-            auto operand = std::get<1>(pair);
-            os << emitter.getOrCreateName(result) << " = ";
-
-            if (!emitter.hasValueInScope(operand))
-              return yieldOp.emitError("operand value not in scope");
-            os << emitter.getOrCreateName(operand);
-            return success();
-          },
-          [&]() { os << ";\n"; })))
-    return failure();
-
-  return success();
-}
-
 static LogicalResult printOperation(CppEmitter &emitter,
                                     func::ReturnOp returnOp) {
   raw_ostream &os = emitter.ostream();
@@ -748,10 +679,11 @@ static LogicalResult printOperation(CppEmitter &emitter,
   for (Operation &op : block.getOperations()) {
     // When generating code for an emitc.if or cf.cond_br op no semicolon
     // needs to be printed after the closing brace.
-    // When generating code for an scf.for op, printing a trailing semicolon
+    // When generating code for an emitc.for op, printing a trailing semicolon
     // is handled within the printOperation function.
     bool trailingSemicolon =
-        !isa(op);
+        !isa(
+            op);
 
     if (failed(emitter.emitOperation(
             op, /*trailingSemicolon=*/trailingSemicolon)))
@@ -1015,15 +947,12 @@ LogicalResult CppEmitter::emitOperation(Operation &op, bool trailingSemicolon) {
           // EmitC ops.
           .Case<emitc::AddOp, emitc::ApplyOp, emitc::AssignOp, emitc::CallOp,
                 emitc::CastOp, emitc::CmpOp, emitc::ConstantOp, emitc::DivOp,
-                emitc::IfOp, emitc::IncludeOp, emitc::MulOp, emitc::RemOp,
-                emitc::SubOp, emitc::VariableOp>(
+                emitc::ForOp, emitc::IfOp, emitc::IncludeOp, emitc::MulOp,
+                emitc::RemOp, emitc::SubOp, emitc::VariableOp>(
              [&](auto op) { return printOperation(*this, op); })
           // Func ops.
           .Case<func::CallOp, func::ConstantOp, func::FuncOp, func::ReturnOp>(
              [&](auto op) { return printOperation(*this, op); })
-          // SCF ops.
-          .Case<scf::ForOp, scf::YieldOp>(
-              [&](auto op) { return printOperation(*this, op); })
          // Arithmetic ops.
.Case( [&](auto op) { return printOperation(*this, op); }) diff --git a/mlir/test/Conversion/SCFToEmitC/for.mlir b/mlir/test/Conversion/SCFToEmitC/for.mlir new file mode 100644 index 0000000000000..7f90310af2189 --- /dev/null +++ b/mlir/test/Conversion/SCFToEmitC/for.mlir @@ -0,0 +1,96 @@ +// RUN: mlir-opt -allow-unregistered-dialect -convert-scf-to-emitc %s | FileCheck %s + +func.func @simple_std_for_loop(%arg0 : index, %arg1 : index, %arg2 : index) { + scf.for %i0 = %arg0 to %arg1 step %arg2 { + %c1 = arith.constant 1 : index + } + return +} +// CHECK-LABEL: func.func @simple_std_for_loop( +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +func.func @simple_std_2_for_loops(%arg0 : index, %arg1 : index, %arg2 : index) { + scf.for %i0 = %arg0 to %arg1 step %arg2 { + %c1 = arith.constant 1 : index + scf.for %i1 = %arg0 to %arg1 step %arg2 { + %c1_0 = arith.constant 1 : index + } + } + return +} +// CHECK-LABEL: func.func @simple_std_2_for_loops( +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) { +// CHECK-NEXT: emitc.for %[[VAL_3:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1 : index +// CHECK-NEXT: emitc.for %[[VAL_5:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_6:.*]] = arith.constant 1 : index +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: return +// CHECK-NEXT: } + +func.func @for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> (f32, f32) { + %s0 = arith.constant 0.0 : f32 + %s1 = arith.constant 1.0 : f32 + %result:2 = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%si = %s0, %sj = %s1) -> (f32, f32) { + %sn = arith.addf %si, %sj : f32 + scf.yield %sn, %sn : f32, f32 + } + return %result#0, %result#1 : f32, f32 +} +// CHECK-LABEL: func.func @for_yield( +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> (f32, f32) { +// CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: %[[VAL_6:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: %[[VAL_8:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_7]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_4]] : f32 to %[[VAL_8]] : f32 +// CHECK-NEXT: emitc.for %[[VAL_9:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_7]], %[[VAL_8]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_7]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_8]] : f32 +// CHECK-NEXT: } +// CHECK-NEXT: emitc.assign %[[VAL_7]] : f32 to %[[VAL_5]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_8]] : f32 to %[[VAL_6]] : f32 +// CHECK-NEXT: return %[[VAL_5]], %[[VAL_6]] : f32, f32 +// CHECK-NEXT: } + +func.func @nested_for_yield(%arg0 : index, %arg1 : index, %arg2 : index) -> f32 { + %s0 = arith.constant 1.0 : f32 + %r = scf.for %i0 = %arg0 to %arg1 step %arg2 iter_args(%iter = %s0) -> (f32) { + %result = scf.for %i1 = %arg0 to %arg1 step %arg2 
iter_args(%si = %iter) -> (f32) { + %sn = arith.addf %si, %si : f32 + scf.yield %sn : f32 + } + scf.yield %result : f32 + } + return %r : f32 +} +// CHECK-LABEL: func.func @nested_for_yield( +// CHECK-SAME: %[[VAL_0:.*]]: index, %[[VAL_1:.*]]: index, %[[VAL_2:.*]]: index) -> f32 { +// CHECK-NEXT: %[[VAL_3:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-NEXT: %[[VAL_4:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: %[[VAL_5:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: emitc.assign %[[VAL_3]] : f32 to %[[VAL_5]] : f32 +// CHECK-NEXT: emitc.for %[[VAL_6:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_7:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: %[[VAL_8:.*]] = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 +// CHECK-NEXT: emitc.assign %[[VAL_5]] : f32 to %[[VAL_8]] : f32 +// CHECK-NEXT: emitc.for %[[VAL_9:.*]] = %[[VAL_0]] to %[[VAL_1]] step %[[VAL_2]] { +// CHECK-NEXT: %[[VAL_10:.*]] = arith.addf %[[VAL_8]], %[[VAL_8]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_10]] : f32 to %[[VAL_8]] : f32 +// CHECK-NEXT: } +// CHECK-NEXT: emitc.assign %[[VAL_8]] : f32 to %[[VAL_7]] : f32 +// CHECK-NEXT: emitc.assign %[[VAL_7]] : f32 to %[[VAL_5]] : f32 +// CHECK-NEXT: } +// CHECK-NEXT: emitc.assign %[[VAL_5]] : f32 to %[[VAL_4]] : f32 +// CHECK-NEXT: return %[[VAL_4]] : f32 +// CHECK-NEXT: } diff --git a/mlir/test/Dialect/EmitC/invalid_ops.mlir b/mlir/test/Dialect/EmitC/invalid_ops.mlir index 9e8f0bf0bf8bd..53d88adf4305f 100644 --- a/mlir/test/Dialect/EmitC/invalid_ops.mlir +++ b/mlir/test/Dialect/EmitC/invalid_ops.mlir @@ -203,7 +203,7 @@ func.func @sub_pointer_pointer(%arg0: !emitc.ptr, %arg1: !emitc.ptr) { // ----- func.func @test_misplaced_yield() { - // expected-error @+1 {{'emitc.yield' op expects parent op 'emitc.if'}} + // expected-error @+1 {{'emitc.yield' op expects parent op to be one of 'emitc.if, emitc.for'}} emitc.yield return } diff --git a/mlir/test/Dialect/EmitC/ops.mlir b/mlir/test/Dialect/EmitC/ops.mlir index 0817945e3b1e0..6c83986809804 100644 --- a/mlir/test/Dialect/EmitC/ops.mlir +++ b/mlir/test/Dialect/EmitC/ops.mlir @@ -105,7 +105,7 @@ func.func @test_if(%arg0: i1, %arg1: f32) { return } -func.func @test_explicit_yield(%arg0: i1, %arg1: f32) { +func.func @test_if_explicit_yield(%arg0: i1, %arg1: f32) { emitc.if %arg0 { %0 = emitc.call "func_const"(%arg1) : (f32) -> i32 emitc.yield @@ -127,3 +127,25 @@ func.func @test_assign(%arg1: f32) { emitc.assign %arg1 : f32 to %v : f32 return } + +func.func @test_for(%arg0 : index, %arg1 : index, %arg2 : index) { + emitc.for %i0 = %arg0 to %arg1 step %arg2 { + %0 = emitc.call "func_const"(%i0) : (index) -> i32 + } + return +} + +func.func @test_for_explicit_yield(%arg0 : index, %arg1 : index, %arg2 : index) { + emitc.for %i0 = %arg0 to %arg1 step %arg2 { + %0 = emitc.call "func_const"(%i0) : (index) -> i32 + emitc.yield + } + return +} + +func.func @test_for_not_index_induction(%arg0 : i16, %arg1 : i16, %arg2 : i16) { + emitc.for %i0 = %arg0 to %arg1 step %arg2 : i16 { + %0 = emitc.call "func_const"(%i0) : (i16) -> i32 + } + return +} diff --git a/mlir/test/Target/Cpp/for.mlir b/mlir/test/Target/Cpp/for.mlir index e904c99820ad8..c02c8b1ac33e3 100644 --- a/mlir/test/Target/Cpp/for.mlir +++ b/mlir/test/Target/Cpp/for.mlir @@ -2,7 +2,7 @@ // RUN: mlir-translate -mlir-to-cpp -declare-variables-at-top %s | FileCheck %s -check-prefix=CPP-DECLTOP func.func @test_for(%arg0 : index, %arg1 : index, 
%arg2 : index) { - scf.for %i0 = %arg0 to %arg1 step %arg2 { + emitc.for %i0 = %arg0 to %arg1 step %arg2 { %0 = emitc.call "f"() : () -> i32 } return @@ -28,11 +28,21 @@ func.func @test_for_yield() { %s0 = arith.constant 0 : i32 %p0 = arith.constant 1.0 : f32 - %result:2 = scf.for %iter = %start to %stop step %step iter_args(%si = %s0, %pi = %p0) -> (i32, f32) { - %sn = emitc.call "add"(%si, %iter) : (i32, index) -> i32 - %pn = emitc.call "mul"(%pi, %iter) : (f32, index) -> f32 - scf.yield %sn, %pn : i32, f32 + %0 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32 + %1 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 + %2 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32 + %3 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 + emitc.assign %s0 : i32 to %2 : i32 + emitc.assign %p0 : f32 to %3 : f32 + emitc.for %iter = %start to %stop step %step { + %sn = emitc.call "add"(%2, %iter) : (i32, index) -> i32 + %pn = emitc.call "mul"(%3, %iter) : (f32, index) -> f32 + emitc.assign %sn : i32 to %2 : i32 + emitc.assign %pn : f32 to %3 : f32 + emitc.yield } + emitc.assign %2 : i32 to %0 : i32 + emitc.assign %3 : f32 to %1 : f32 return } @@ -44,8 +54,10 @@ func.func @test_for_yield() { // CPP-DEFAULT-NEXT: float [[P0:[^ ]*]] = (float)1.000000000e+00; // CPP-DEFAULT-NEXT: int32_t [[SE:[^ ]*]]; // CPP-DEFAULT-NEXT: float [[PE:[^ ]*]]; -// CPP-DEFAULT-NEXT: int32_t [[SI:[^ ]*]] = [[S0]]; -// CPP-DEFAULT-NEXT: float [[PI:[^ ]*]] = [[P0]]; +// CPP-DEFAULT-NEXT: int32_t [[SI:[^ ]*]]; +// CPP-DEFAULT-NEXT: float [[PI:[^ ]*]]; +// CPP-DEFAULT-NEXT: [[SI:[^ ]*]] = [[S0]]; +// CPP-DEFAULT-NEXT: [[PI:[^ ]*]] = [[P0]]; // CPP-DEFAULT-NEXT: for (size_t [[ITER:[^ ]*]] = [[START]]; [[ITER]] < [[STOP]]; [[ITER]] += [[STEP]]) { // CPP-DEFAULT-NEXT: int32_t [[SN:[^ ]*]] = add([[SI]], [[ITER]]); // CPP-DEFAULT-NEXT: float [[PN:[^ ]*]] = mul([[PI]], [[ITER]]); @@ -64,6 +76,8 @@ func.func @test_for_yield() { // CPP-DECLTOP-NEXT: float [[P0:[^ ]*]]; // CPP-DECLTOP-NEXT: int32_t [[SE:[^ ]*]]; // CPP-DECLTOP-NEXT: float [[PE:[^ ]*]]; +// CPP-DECLTOP-NEXT: int32_t [[SI:[^ ]*]]; +// CPP-DECLTOP-NEXT: float [[PI:[^ ]*]]; // CPP-DECLTOP-NEXT: int32_t [[SN:[^ ]*]]; // CPP-DECLTOP-NEXT: float [[PN:[^ ]*]]; // CPP-DECLTOP-NEXT: [[START]] = 0; @@ -71,8 +85,12 @@ func.func @test_for_yield() { // CPP-DECLTOP-NEXT: [[STEP]] = 1; // CPP-DECLTOP-NEXT: [[S0]] = 0; // CPP-DECLTOP-NEXT: [[P0]] = (float)1.000000000e+00; -// CPP-DECLTOP-NEXT: int32_t [[SI:[^ ]*]] = [[S0]]; -// CPP-DECLTOP-NEXT: float [[PI:[^ ]*]] = [[P0]]; +// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: ; +// CPP-DECLTOP-NEXT: [[SI:[^ ]*]] = [[S0]]; +// CPP-DECLTOP-NEXT: [[PI:[^ ]*]] = [[P0]]; // CPP-DECLTOP-NEXT: for (size_t [[ITER:[^ ]*]] = [[START]]; [[ITER]] < [[STOP]]; [[ITER]] += [[STEP]]) { // CPP-DECLTOP-NEXT: [[SN]] = add([[SI]], [[ITER]]); // CPP-DECLTOP-NEXT: [[PN]] = mul([[PI]], [[ITER]]); @@ -91,14 +109,24 @@ func.func @test_for_yield_2() { %s0 = emitc.literal "0" : i32 %p0 = emitc.literal "M_PI" : f32 - %result:2 = scf.for %iter = %start to %stop step %step iter_args(%si = %s0, %pi = %p0) -> (i32, f32) { - %sn = emitc.call "add"(%si, %iter) : (i32, index) -> i32 - %pn = emitc.call "mul"(%pi, %iter) : (f32, index) -> f32 - scf.yield %sn, %pn : i32, f32 + %0 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32 + %1 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 + %2 = "emitc.variable"() <{value = #emitc.opaque<"">}> : () -> i32 + %3 = 
"emitc.variable"() <{value = #emitc.opaque<"">}> : () -> f32 + emitc.assign %s0 : i32 to %2 : i32 + emitc.assign %p0 : f32 to %3 : f32 + emitc.for %iter = %start to %stop step %step { + %sn = emitc.call "add"(%2, %iter) : (i32, index) -> i32 + %pn = emitc.call "mul"(%3, %iter) : (f32, index) -> f32 + emitc.assign %sn : i32 to %2 : i32 + emitc.assign %pn : f32 to %3 : f32 + emitc.yield } + emitc.assign %2 : i32 to %0 : i32 + emitc.assign %3 : f32 to %1 : f32 return } // CPP-DEFAULT: void test_for_yield_2() { -// CPP-DEFAULT: float{{.*}}= M_PI +// CPP-DEFAULT: {{.*}}= M_PI // CPP-DEFAULT: for (size_t [[IN:.*]] = 0; [[IN]] < 10; [[IN]] += 1) { From f118d474eb64c29aff848fe60d1a49fbd310dcbe Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Thu, 26 Oct 2023 07:08:58 -0700 Subject: [PATCH 072/877] [AMDGPU] Use alloca address space in rewrite-out-arguments.ll (#70269) This is needed for the transform to fire with a correct data layout. Pre-commiting this change to keep the diff of D141060 smaller. --- .../CodeGen/AMDGPU/rewrite-out-arguments.ll | 652 +++++++++--------- 1 file changed, 327 insertions(+), 325 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll index 72d0053693b78..0d35ba8e1161e 100644 --- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll +++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll @@ -1,119 +1,121 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-rewrite-out-arguments < %s | FileCheck %s - +; Temporarily add an explicit datalayout until https://reviews.llvm.org/D141060 lands +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8" +target triple = "amdgcn-amd-amdhsa" define void @no_ret_blocks() #0 { unreachable } -define void @void_one_out_arg_i32_no_use(ptr %val) #0 { +define void @void_one_out_arg_i32_no_use(ptr addrspace(5) %val) #0 { ret void } -define void @skip_byval_arg(ptr byval(i32) %val) #0 { - store i32 0, ptr %val +define void @skip_byval_arg(ptr addrspace(5) byval(i32) %val) #0 { + store i32 0, ptr addrspace(5) %val ret void } -define void @skip_optnone(ptr byval(i32) %val) #1 { - store i32 0, ptr %val +define void @skip_optnone(ptr addrspace(5) byval(i32) %val) #1 { + store i32 0, ptr addrspace(5) %val ret void } -define void @skip_volatile(ptr byval(i32) %val) #0 { - store volatile i32 0, ptr %val +define void @skip_volatile(ptr addrspace(5) byval(i32) %val) #0 { + store volatile i32 0, ptr addrspace(5) %val ret void } -define void @skip_atomic(ptr byval(i32) %val) #0 { - store atomic i32 0, ptr %val seq_cst, align 4 +define void @skip_atomic(ptr addrspace(5) byval(i32) %val) #0 { + store atomic i32 0, ptr addrspace(5) %val seq_cst, align 4 ret void } -define void @skip_store_pointer_val(ptr %val) #0 { - store ptr %val, ptr poison +define void @skip_store_pointer_val(ptr addrspace(5) %val) #0 { + store ptr addrspace(5) %val, ptr poison ret void } -define void @skip_store_gep(ptr %val) #0 { - %gep = getelementptr inbounds i32, ptr %val, i32 1 - store i32 0, ptr %gep +define void @skip_store_gep(ptr addrspace(5) %val) #0 { + %gep = getelementptr inbounds i32, ptr addrspace(5) %val, i32 1 + store i32 0, ptr addrspace(5) %gep ret void } -define void @skip_sret(ptr 
---
 .../CodeGen/AMDGPU/rewrite-out-arguments.ll   | 652 +++++++++---------
 1 file changed, 327 insertions(+), 325 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
index 72d0053693b78..0d35ba8e1161e 100644
--- a/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
+++ b/llvm/test/CodeGen/AMDGPU/rewrite-out-arguments.ll
@@ -1,119 +1,121 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs
 ; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-rewrite-out-arguments < %s | FileCheck %s
-
+; Temporarily add an explicit datalayout until https://reviews.llvm.org/D141060 lands
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+target triple = "amdgcn-amd-amdhsa"
 
 define void @no_ret_blocks() #0 {
   unreachable
 }
 
-define void @void_one_out_arg_i32_no_use(ptr %val) #0 {
+define void @void_one_out_arg_i32_no_use(ptr addrspace(5) %val) #0 {
   ret void
 }
 
-define void @skip_byval_arg(ptr byval(i32) %val) #0 {
-  store i32 0, ptr %val
+define void @skip_byval_arg(ptr addrspace(5) byval(i32) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
   ret void
 }
 
-define void @skip_optnone(ptr byval(i32) %val) #1 {
-  store i32 0, ptr %val
+define void @skip_optnone(ptr addrspace(5) byval(i32) %val) #1 {
+  store i32 0, ptr addrspace(5) %val
   ret void
 }
 
-define void @skip_volatile(ptr byval(i32) %val) #0 {
-  store volatile i32 0, ptr %val
+define void @skip_volatile(ptr addrspace(5) byval(i32) %val) #0 {
+  store volatile i32 0, ptr addrspace(5) %val
   ret void
 }
 
-define void @skip_atomic(ptr byval(i32) %val) #0 {
-  store atomic i32 0, ptr %val seq_cst, align 4
+define void @skip_atomic(ptr addrspace(5) byval(i32) %val) #0 {
+  store atomic i32 0, ptr addrspace(5) %val seq_cst, align 4
  ret void
 }
 
-define void @skip_store_pointer_val(ptr %val) #0 {
-  store ptr %val, ptr poison
+define void @skip_store_pointer_val(ptr addrspace(5) %val) #0 {
+  store ptr addrspace(5) %val, ptr poison
   ret void
 }
 
-define void @skip_store_gep(ptr %val) #0 {
-  %gep = getelementptr inbounds i32, ptr %val, i32 1
-  store i32 0, ptr %gep
+define void @skip_store_gep(ptr addrspace(5) %val) #0 {
+  %gep = getelementptr inbounds i32, ptr addrspace(5) %val, i32 1
+  store i32 0, ptr addrspace(5) %gep
   ret void
 }
 
-define void @skip_sret(ptr sret(i32) %sret, ptr %out) #0 {
-  store i32 1, ptr %sret
-  store i32 0, ptr %out
+define void @skip_sret(ptr addrspace(5) sret(i32) %sret, ptr addrspace(5) %out) #0 {
+  store i32 1, ptr addrspace(5) %sret
+  store i32 0, ptr addrspace(5) %out
   ret void
 }
 
-define void @void_one_out_arg_i32_1_use(ptr %val) #0 {
-  store i32 0, ptr %val
+define void @void_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
   ret void
 }
 
-define void @void_one_out_arg_i32_1_use_align(ptr align 8 %val) #0 {
-  store i32 0, ptr %val, align 8
+define void @void_one_out_arg_i32_1_use_align(ptr addrspace(5) align 8 %val) #0 {
+  store i32 0, ptr addrspace(5) %val, align 8
   ret void
 }
 
-define void @void_one_out_arg_i32_2_use(i1 %arg0, ptr %val) #0 {
+define void @void_one_out_arg_i32_2_use(i1 %arg0, ptr addrspace(5) %val) #0 {
   br i1 %arg0, label %ret0, label %ret1
 
 ret0:
-  store i32 0, ptr %val
+  store i32 0, ptr addrspace(5) %val
   ret void
 
ret1:
-  store i32 9, ptr %val
+  store i32 9, ptr addrspace(5) %val
   ret void
 }
 
 declare void @may.clobber()
 
-define void @void_one_out_arg_i32_2_stores(ptr %val) #0 {
-  store i32 0, ptr %val
-  store i32 1, ptr %val
+define void @void_one_out_arg_i32_2_stores(ptr addrspace(5) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
+  store i32 1, ptr addrspace(5) %val
   ret void
 }
 
-define void @void_one_out_arg_i32_2_stores_clobber(ptr %val) #0 {
-  store i32 0, ptr %val
+define void @void_one_out_arg_i32_2_stores_clobber(ptr addrspace(5) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
   call void @may.clobber()
-  store i32 1, ptr %val
+  store i32 1, ptr addrspace(5) %val
   ret void
 }
 
-define void @void_one_out_arg_i32_call_may_clobber(ptr %val) #0 {
-  store i32 0, ptr %val
+define void @void_one_out_arg_i32_call_may_clobber(ptr addrspace(5) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
   call void @may.clobber()
   ret void
 }
 
-define void @void_one_out_arg_i32_pre_call_may_clobber(ptr %val) #0 {
+define void @void_one_out_arg_i32_pre_call_may_clobber(ptr addrspace(5) %val) #0 {
   call void @may.clobber()
-  store i32 0, ptr %val
+  store i32 0, ptr addrspace(5) %val
   ret void
 }
 
-define void @void_one_out_arg_i32_reload(ptr %val) #0 {
-  store i32 0, ptr %val
-  %load = load i32, ptr %val, align 4
+define void @void_one_out_arg_i32_reload(ptr addrspace(5) %val) #0 {
+  store i32 0, ptr addrspace(5) %val
+  %load = load i32, ptr addrspace(5) %val, align 4
   ret void
 }
 
-define void @void_one_out_arg_i32_store_in_different_block(ptr %out) #0 {
+define void @void_one_out_arg_i32_store_in_different_block(ptr addrspace(5) %out) #0 {
   %load = load i32, ptr addrspace(1) poison
-  store i32 0, ptr %out
+  store i32 0, ptr addrspace(5) %out
   br label %ret
 
@@ -121,20 +123,20 @@ ret:
 }
 
-define void @unused_out_arg_one_branch(i1 %arg0, ptr %val) #0 {
+define void @unused_out_arg_one_branch(i1 %arg0, ptr addrspace(5) %val) #0 {
   br i1 %arg0, label %ret0, label %ret1
 
 ret0:
   ret void
 
 ret1:
-  store i32 9, ptr %val
+  store i32 9, ptr addrspace(5) %val
   ret void
 }
 
-define void @void_one_out_arg_v2i32_1_use(ptr %val) #0 {
-  store <2 x i32> , ptr %val
+define void @void_one_out_arg_v2i32_1_use(ptr addrspace(5) %val) #0 {
+  store <2 x i32> , ptr addrspace(5) %val
   ret void
 }
 
@@ -142,50 +144,50 @@ define void @void_one_out_arg_v2i32_1_use(ptr %val) #0 {
 ; Normally this is split into element accesses which we don't handle.
-define void @void_one_out_arg_struct_1_use(ptr %out) #0 { - store %struct { i32 9, i8 99, float 4.0 }, ptr %out +define void @void_one_out_arg_struct_1_use(ptr addrspace(5) %out) #0 { + store %struct { i32 9, i8 99, float 4.0 }, ptr addrspace(5) %out ret void } -define i32 @i32_one_out_arg_i32_1_use(ptr %val) #0 { - store i32 24, ptr %val +define i32 @i32_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 { + store i32 24, ptr addrspace(5) %val ret i32 9 } -define void @unused_different_type(ptr %arg0, ptr nocapture %arg1) #0 { - store float 4.0, ptr %arg1, align 4 +define void @unused_different_type(ptr addrspace(5) %arg0, ptr addrspace(5) nocapture %arg1) #0 { + store float 4.0, ptr addrspace(5) %arg1, align 4 ret void } -define void @multiple_same_return_noalias(ptr noalias %out0, ptr noalias %out1) #0 { - store i32 1, ptr %out0, align 4 - store i32 2, ptr %out1, align 4 +define void @multiple_same_return_noalias(ptr addrspace(5) noalias %out0, ptr addrspace(5) noalias %out1) #0 { + store i32 1, ptr addrspace(5) %out0, align 4 + store i32 2, ptr addrspace(5) %out1, align 4 ret void } -define void @multiple_same_return_mayalias(ptr %out0, ptr %out1) #0 { - store i32 1, ptr %out0, align 4 - store i32 2, ptr %out1, align 4 +define void @multiple_same_return_mayalias(ptr addrspace(5) %out0, ptr addrspace(5) %out1) #0 { + store i32 1, ptr addrspace(5) %out0, align 4 + store i32 2, ptr addrspace(5) %out1, align 4 ret void } -define void @multiple_same_return_mayalias_order(ptr %out0, ptr %out1) #0 { - store i32 2, ptr %out1, align 4 - store i32 1, ptr %out0, align 4 +define void @multiple_same_return_mayalias_order(ptr addrspace(5) %out0, ptr addrspace(5) %out1) #0 { + store i32 2, ptr addrspace(5) %out1, align 4 + store i32 1, ptr addrspace(5) %out0, align 4 ret void } ; Currently this fails to convert because the store won't be found if ; it isn't in the same block as the return. -define i32 @store_in_entry_block(i1 %arg0, ptr %out) #0 { +define i32 @store_in_entry_block(i1 %arg0, ptr addrspace(5) %out) #0 { entry: %val0 = load i32, ptr addrspace(1) poison - store i32 %val0, ptr %out + store i32 %val0, ptr addrspace(5) %out br i1 %arg0, label %if, label %endif if: @@ -198,8 +200,8 @@ endif: } -define i1 @i1_one_out_arg_i32_1_use(ptr %val) #0 { - store i32 24, ptr %val +define i1 @i1_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 { + store i32 24, ptr addrspace(5) %val ret i1 true } @@ -207,20 +209,20 @@ define i1 @i1_one_out_arg_i32_1_use(ptr %val) #0 { ; incompatible with struct return types. 
-define zeroext i1 @i1_zeroext_one_out_arg_i32_1_use(ptr %val) #0 { - store i32 24, ptr %val +define zeroext i1 @i1_zeroext_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 { + store i32 24, ptr addrspace(5) %val ret i1 true } -define signext i1 @i1_signext_one_out_arg_i32_1_use(ptr %val) #0 { - store i32 24, ptr %val +define signext i1 @i1_signext_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 { + store i32 24, ptr addrspace(5) %val ret i1 true } -define noalias ptr addrspace(1) @p1i32_noalias_one_out_arg_i32_1_use(ptr %val) #0 { - store i32 24, ptr %val +define noalias ptr addrspace(1) @p1i32_noalias_one_out_arg_i32_1_use(ptr addrspace(5) %val) #0 { + store i32 24, ptr addrspace(5) %val ret ptr addrspace(1) null } @@ -229,74 +231,74 @@ define void @void_one_out_non_private_arg_i32_1_use(ptr addrspace(1) %val) #0 { ret void } -define void @func_ptr_type(ptr %out) #0 { +define void @func_ptr_type(ptr addrspace(5) %out) #0 { %func = load ptr, ptr poison - store ptr %func, ptr %out + store ptr %func, ptr addrspace(5) %out ret void } -define void @bitcast_func_ptr_type(ptr %out) #0 { +define void @bitcast_func_ptr_type(ptr addrspace(5) %out) #0 { %func = load ptr, ptr poison - store ptr %func, ptr %out + store ptr %func, ptr addrspace(5) %out ret void } -define void @out_arg_small_array(ptr %val) #0 { - store [4 x i32] [i32 0, i32 1, i32 2, i32 3], ptr %val +define void @out_arg_small_array(ptr addrspace(5) %val) #0 { + store [4 x i32] [i32 0, i32 1, i32 2, i32 3], ptr addrspace(5) %val ret void } -define void @out_arg_large_array(ptr %val) #0 { - store [17 x i32] zeroinitializer, ptr %val +define void @out_arg_large_array(ptr addrspace(5) %val) #0 { + store [17 x i32] zeroinitializer, ptr addrspace(5) %val ret void } -define <16 x i32> @num_regs_return_limit(ptr %out, i32 %val) #0 { +define <16 x i32> @num_regs_return_limit(ptr addrspace(5) %out, i32 %val) #0 { %load = load volatile <16 x i32>, ptr addrspace(1) poison - store i32 %val, ptr %out + store i32 %val, ptr addrspace(5) %out ret <16 x i32> %load } -define [15 x i32] @num_regs_reach_limit(ptr %out, i32 %val) #0 { +define [15 x i32] @num_regs_reach_limit(ptr addrspace(5) %out, i32 %val) #0 { %load = load volatile [15 x i32], ptr addrspace(1) poison - store i32 %val, ptr %out + store i32 %val, ptr addrspace(5) %out ret [15 x i32] %load } -define [15 x i32] @num_regs_reach_limit_leftover(ptr %out0, ptr %out1, i32 %val0) #0 { +define [15 x i32] @num_regs_reach_limit_leftover(ptr addrspace(5) %out0, ptr addrspace(5) %out1, i32 %val0) #0 { %load0 = load volatile [15 x i32], ptr addrspace(1) poison %load1 = load volatile i32, ptr addrspace(1) poison - store i32 %val0, ptr %out0 - store i32 %load1, ptr %out1 + store i32 %val0, ptr addrspace(5) %out0 + store i32 %load1, ptr addrspace(5) %out1 ret [15 x i32] %load0 } -define void @preserve_debug_info(i32 %arg0, ptr %val) #0 !dbg !5 { +define void @preserve_debug_info(i32 %arg0, ptr addrspace(5) %val) #0 !dbg !5 { call void @may.clobber(), !dbg !10 - store i32 %arg0, ptr %val, !dbg !11 + store i32 %arg0, ptr addrspace(5) %val, !dbg !11 ret void, !dbg !12 } -define void @preserve_metadata(i32 %arg0, ptr %val) #0 !kernel_arg_access_qual !13 { +define void @preserve_metadata(i32 %arg0, ptr addrspace(5) %val) #0 !kernel_arg_access_qual !13 { call void @may.clobber() - store i32 %arg0, ptr %val + store i32 %arg0, ptr addrspace(5) %val ret void } ; Clang emits this pattern for 3-vectors for some reason. 
-define void @bitcast_pointer_v4i32_v3i32(ptr %out) #0 { +define void @bitcast_pointer_v4i32_v3i32(ptr addrspace(5) %out) #0 { %load = load volatile <4 x i32>, ptr addrspace(1) poison - store <4 x i32> %load, ptr %out + store <4 x i32> %load, ptr addrspace(5) %out ret void } -define void @bitcast_pointer_v4i32_v3f32(ptr %out) #0 { +define void @bitcast_pointer_v4i32_v3f32(ptr addrspace(5) %out) #0 { %load = load volatile <4 x i32>, ptr addrspace(1) poison - store <4 x i32> %load, ptr %out + store <4 x i32> %load, ptr addrspace(5) %out ret void } @@ -305,21 +307,21 @@ define void @bitcast_pointer_v4i32_v3f32(ptr %out) #0 { ; casts. -define void @bitcast_pointer_i32_f32(ptr %out) #0 { +define void @bitcast_pointer_i32_f32(ptr addrspace(5) %out) #0 { %load = load volatile i32, ptr addrspace(1) poison - store i32 %load, ptr %out + store i32 %load, ptr addrspace(5) %out ret void } -define void @bitcast_pointer_i32_f16(ptr %out) #0 { +define void @bitcast_pointer_i32_f16(ptr addrspace(5) %out) #0 { %load = load volatile i32, ptr addrspace(1) poison - store i32 %load, ptr %out + store i32 %load, ptr addrspace(5) %out ret void } -define void @bitcast_pointer_f16_i32(ptr %out) #0 { +define void @bitcast_pointer_f16_i32(ptr addrspace(5) %out) #0 { %load = load volatile half, ptr addrspace(1) poison - store half %load, ptr %out + store half %load, ptr addrspace(5) %out ret void } @@ -330,80 +332,80 @@ define void @bitcast_pointer_f16_i32(ptr %out) #0 { %struct.v4f32 = type { <4 x float> } -define void @bitcast_struct_v3f32_v3f32(ptr %out, <3 x float> %value) #0 { +define void @bitcast_struct_v3f32_v3f32(ptr addrspace(5) %out, <3 x float> %value) #0 { %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> - store <4 x float> %extractVec, ptr %out, align 16 + store <4 x float> %extractVec, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v3f32_v3i32(ptr %out, <3 x i32> %value) #0 { +define void @bitcast_struct_v3f32_v3i32(ptr addrspace(5) %out, <3 x i32> %value) #0 { %extractVec = shufflevector <3 x i32> %value, <3 x i32> poison, <4 x i32> - store <4 x i32> %extractVec, ptr %out, align 16 + store <4 x i32> %extractVec, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v4f32_v4f32(ptr %out, <4 x float> %value) #0 { - store <4 x float> %value, ptr %out, align 16 +define void @bitcast_struct_v4f32_v4f32(ptr addrspace(5) %out, <4 x float> %value) #0 { + store <4 x float> %value, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v3f32_v4i32(ptr %out, <4 x i32> %value) #0 { - store <4 x i32> %value, ptr %out, align 16 +define void @bitcast_struct_v3f32_v4i32(ptr addrspace(5) %out, <4 x i32> %value) #0 { + store <4 x i32> %value, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v4f32_v3f32(ptr %out, <3 x float> %value) #0 { +define void @bitcast_struct_v4f32_v3f32(ptr addrspace(5) %out, <3 x float> %value) #0 { %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> - store <4 x float> %extractVec, ptr %out, align 16 + store <4 x float> %extractVec, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v3f32_v2f32(ptr %out, <2 x float> %value) #0 { - store <2 x float> %value, ptr %out, align 8 +define void @bitcast_struct_v3f32_v2f32(ptr addrspace(5) %out, <2 x float> %value) #0 { + store <2 x float> %value, ptr addrspace(5) %out, align 8 ret void } -define void @bitcast_struct_v3f32_f32_v3f32(ptr %out, <3 x float> %value) #0 { +define void 
@bitcast_struct_v3f32_f32_v3f32(ptr addrspace(5) %out, <3 x float> %value) #0 { %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> - store <4 x float> %extractVec, ptr %out, align 16 + store <4 x float> %extractVec, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_v3f32_f32_v4f32(ptr %out, <4 x float> %value) #0 { - store <4 x float> %value, ptr %out, align 16 +define void @bitcast_struct_v3f32_f32_v4f32(ptr addrspace(5) %out, <4 x float> %value) #0 { + store <4 x float> %value, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_struct_i128_v4f32(ptr %out, <4 x float> %value) #0 { - store <4 x float> %value, ptr %out, align 16 +define void @bitcast_struct_i128_v4f32(ptr addrspace(5) %out, <4 x float> %value) #0 { + store <4 x float> %value, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_array_v4i32_v4f32(ptr %out, [4 x float] %value) #0 { - store [4 x float] %value, ptr %out, align 4 +define void @bitcast_array_v4i32_v4f32(ptr addrspace(5) %out, [4 x float] %value) #0 { + store [4 x float] %value, ptr addrspace(5) %out, align 4 ret void } -define void @multi_return_bitcast_struct_v3f32_v3f32(i1 %cond, ptr %out, <3 x float> %value) #0 { +define void @multi_return_bitcast_struct_v3f32_v3f32(i1 %cond, ptr addrspace(5) %out, <3 x float> %value) #0 { entry: br i1 %cond, label %ret0, label %ret1 ret0: %extractVec = shufflevector <3 x float> %value, <3 x float> poison, <4 x i32> - store <4 x float> %extractVec, ptr %out, align 16 + store <4 x float> %extractVec, ptr addrspace(5) %out, align 16 ret void ret1: %load = load <4 x float>, ptr addrspace(1) poison - store <4 x float> %load, ptr %out, align 16 + store <4 x float> %load, ptr addrspace(5) %out, align 16 ret void } -define void @bitcast_v3f32_struct_v3f32(ptr %out, %struct.v3f32 %value) #0 { - store %struct.v3f32 %value, ptr %out, align 4 +define void @bitcast_v3f32_struct_v3f32(ptr addrspace(5) %out, %struct.v3f32 %value) #0 { + store %struct.v3f32 %value, ptr addrspace(5) %out, align 4 ret void } @@ -435,82 +437,82 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_no_use -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_byval_arg -; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) byval(i32) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_optnone -; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) byval(i32) [[VAL:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_volatile -; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store volatile i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) byval(i32) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store volatile i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_atomic -; CHECK-SAME: (ptr byval(i32) [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store atomic i32 0, ptr [[VAL]] seq_cst, align 4 +; CHECK-SAME: (ptr addrspace(5) byval(i32) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store atomic 
i32 0, ptr addrspace(5) [[VAL]] seq_cst, align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_store_pointer_val -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store ptr [[VAL]], ptr poison, align 8 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store ptr addrspace(5) [[VAL]], ptr poison, align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_store_gep -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[VAL]], i32 1 -; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[VAL]], i32 1 +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[GEP]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@skip_sret -; CHECK-SAME: (ptr sret(i32) [[SRET:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 1, ptr [[SRET]], align 4 -; CHECK-NEXT: store i32 0, ptr [[OUT]], align 4 +; CHECK-SAME: (ptr addrspace(5) sret(i32) [[SRET:%.*]], ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 1, ptr addrspace(5) [[SRET]], align 4 +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[OUT]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] zeroinitializer ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] @void_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2:[0-9]+]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE:%.*]] @void_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align.body -; CHECK-SAME: (ptr align 8 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) align 8 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] zeroinitializer ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_1_use_align -; CHECK-SAME: (ptr align 8 [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] @void_one_out_arg_i32_1_use_align.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) align 8 [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN:%.*]] @void_one_out_arg_i32_1_use_align.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_1_USE_ALIGN]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use.body -; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i1 [[ARG0:%.*]], ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: br i1 [[ARG0]], label [[RET0:%.*]], label [[RET1:%.*]] ; CHECK: ret0: ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] zeroinitializer @@ -519,192 
+521,192 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_use -; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] @void_one_out_arg_i32_2_use.body(i1 [[TMP0]], ptr poison) +; CHECK-SAME: (i1 [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_USE:%.*]] @void_one_out_arg_i32_2_use.body(i1 [[TMP0]], ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_USE]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] { i32 1 } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] @void_one_out_arg_i32_2_stores.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES:%.*]] @void_one_out_arg_i32_2_stores.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: call void @may.clobber() ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] { i32 1 } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_2_stores_clobber -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] @void_one_out_arg_i32_2_stores_clobber.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER:%.*]] @void_one_out_arg_i32_2_stores_clobber.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_2_STORES_CLOBBER]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_call_may_clobber -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: call void @may.clobber() ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @may.clobber() ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] 
zeroinitializer ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_pre_call_may_clobber -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] @void_one_out_arg_i32_pre_call_may_clobber.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER:%.*]] @void_one_out_arg_i32_pre_call_may_clobber.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_I32_PRE_CALL_MAY_CLOBBER]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_reload -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4 -; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[VAL]], align 4 +; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_i32_store_in_different_block -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr addrspace(1) poison, align 4 -; CHECK-NEXT: store i32 0, ptr [[OUT]], align 4 +; CHECK-NEXT: store i32 0, ptr addrspace(5) [[OUT]], align 4 ; CHECK-NEXT: br label [[RET:%.*]] ; CHECK: ret: ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@unused_out_arg_one_branch -; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i1 [[ARG0:%.*]], ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: br i1 [[ARG0]], label [[RET0:%.*]], label [[RET1:%.*]] ; CHECK: ret0: ; CHECK-NEXT: ret void ; CHECK: ret1: -; CHECK-NEXT: store i32 9, ptr [[VAL]], align 4 +; CHECK-NEXT: store i32 9, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] { <2 x i32> } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_v2i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] @void_one_out_arg_v2i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_V2I32_1_USE:%.*]] @void_one_out_arg_v2i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_V2I32_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP0]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] { [[STRUCT:%.*]] { i32 9, i8 99, float 4.000000e+00 } } ; ; ; CHECK-LABEL: define {{[^@]+}}@void_one_out_arg_struct_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] @void_one_out_arg_struct_1_use.body(ptr poison) +; CHECK-SAME: (ptr 
addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[VOID_ONE_OUT_ARG_STRUCT_1_USE:%.*]] @void_one_out_arg_struct_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[VOID_ONE_OUT_ARG_STRUCT_1_USE]] [[TMP2]], 0 -; CHECK-NEXT: store [[STRUCT:%.*]] [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store [[STRUCT:%.*]] [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] { i32 9, i32 24 } ; ; ; CHECK-LABEL: define {{[^@]+}}@i32_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] @i32_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[I32_ONE_OUT_ARG_I32_1_USE:%.*]] @i32_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I32_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 ; CHECK-NEXT: ret i32 [[TMP4]] ; ; ; CHECK-LABEL: define {{[^@]+}}@unused_different_type.body -; CHECK-SAME: (ptr [[ARG0:%.*]], ptr nocapture [[ARG1:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[ARG0:%.*]], ptr addrspace(5) nocapture [[ARG1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[UNUSED_DIFFERENT_TYPE:%.*]] { float 4.000000e+00 } ; ; ; CHECK-LABEL: define {{[^@]+}}@unused_different_type -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr nocapture [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[UNUSED_DIFFERENT_TYPE:%.*]] @unused_different_type.body(ptr [[TMP0]], ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], ptr addrspace(5) nocapture [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[UNUSED_DIFFERENT_TYPE:%.*]] @unused_different_type.body(ptr addrspace(5) [[TMP0]], ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[UNUSED_DIFFERENT_TYPE]] [[TMP3]], 0 -; CHECK-NEXT: store float [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store float [[TMP4]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias.body -; CHECK-SAME: (ptr noalias [[OUT0:%.*]], ptr noalias [[OUT1:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) noalias [[OUT0:%.*]], ptr addrspace(5) noalias [[OUT1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] { i32 1, i32 2 } ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_noalias -; CHECK-SAME: (ptr noalias [[TMP0:%.*]], ptr noalias [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] @multiple_same_return_noalias.body(ptr poison, ptr poison) +; CHECK-SAME: (ptr addrspace(5) noalias [[TMP0:%.*]], ptr addrspace(5) noalias [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_NOALIAS:%.*]] @multiple_same_return_noalias.body(ptr addrspace(5) poison, ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: 
[[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_NOALIAS]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias.body -; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT0:%.*]], ptr addrspace(5) [[OUT1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] { i32 2, i32 1 } ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] @multiple_same_return_mayalias.body(ptr poison, ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS:%.*]] @multiple_same_return_mayalias.body(ptr addrspace(5) poison, ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order.body -; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT0:%.*]], ptr addrspace(5) [[OUT1:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] { i32 1, i32 2 } ; ; ; CHECK-LABEL: define {{[^@]+}}@multiple_same_return_mayalias_order -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] @multiple_same_return_mayalias_order.body(ptr poison, ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER:%.*]] @multiple_same_return_mayalias_order.body(ptr addrspace(5) poison, ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTIPLE_SAME_RETURN_MAYALIAS_ORDER]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@store_in_entry_block -; CHECK-SAME: (i1 [[ARG0:%.*]], ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i1 [[ARG0:%.*]], ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(1) poison, align 4 -; CHECK-NEXT: store i32 [[VAL0]], ptr [[OUT]], align 4 +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(5) [[OUT]], align 4 ; CHECK-NEXT: br i1 [[ARG0]], label [[IF:%.*]], label [[ENDIF:%.*]] ; CHECK: if: ; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(1) poison, align 4 @@ -715,57 +717,57 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { 
+; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I1_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TMP4]] ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_zeroext_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_zeroext_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_zeroext_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I1_ZEROEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TMP4]] ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] { i1 true, i32 24 } ; ; ; CHECK-LABEL: define {{[^@]+}}@i1_signext_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_signext_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE:%.*]] @i1_signext_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[I1_SIGNEXT_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 ; CHECK-NEXT: ret i1 [[TMP4]] ; ; ; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] { ptr addrspace(1) null, i32 24 } ; ; ; CHECK-LABEL: define {{[^@]+}}@p1i32_noalias_one_out_arg_i32_1_use -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] @p1i32_noalias_one_out_arg_i32_1_use.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] 
= call [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE:%.*]] @p1i32_noalias_one_out_arg_i32_1_use.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 1 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[P1I32_NOALIAS_ONE_OUT_ARG_I32_1_USE]] [[TMP2]], 0 ; CHECK-NEXT: ret ptr addrspace(1) [[TMP4]] ; @@ -777,63 +779,63 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@func_ptr_type.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FUNC:%.*]] = load ptr, ptr poison, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0 ; CHECK-NEXT: ret [[FUNC_PTR_TYPE]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@func_ptr_type -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[FUNC_PTR_TYPE:%.*]] @func_ptr_type.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[FUNC_PTR_TYPE:%.*]] @func_ptr_type.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[FUNC_PTR_TYPE]] [[TMP2]], 0 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[TMP0]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[FUNC:%.*]] = load ptr, ptr poison, align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_FUNC_PTR_TYPE:%.*]] poison, ptr [[FUNC]], 0 ; CHECK-NEXT: ret [[BITCAST_FUNC_PTR_TYPE]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_func_ptr_type -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_FUNC_PTR_TYPE:%.*]] @bitcast_func_ptr_type.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_FUNC_PTR_TYPE]] [[TMP2]], 0 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store ptr [[TMP3]], ptr addrspace(5) [[TMP0]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@out_arg_small_array.body -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret [[OUT_ARG_SMALL_ARRAY:%.*]] { [4 x i32] [i32 0, i32 1, i32 2, i32 3] } ; ; ; CHECK-LABEL: define {{[^@]+}}@out_arg_small_array -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[OUT_ARG_SMALL_ARRAY:%.*]] @out_arg_small_array.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[OUT_ARG_SMALL_ARRAY:%.*]] @out_arg_small_array.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[OUT_ARG_SMALL_ARRAY]] [[TMP2]], 0 -; CHECK-NEXT: store [4 x i32] [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store [4 x i32] [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@out_arg_large_array -; CHECK-SAME: (ptr [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: store [17 x i32] zeroinitializer, ptr [[VAL]], align 4 +; CHECK-SAME: (ptr addrspace(5) [[VAL:%.*]]) 
#[[ATTR0]] { +; CHECK-NEXT: store [17 x i32] zeroinitializer, ptr addrspace(5) [[VAL]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@num_regs_return_limit -; CHECK-SAME: (ptr [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <16 x i32>, ptr addrspace(1) poison, align 64 -; CHECK-NEXT: store i32 [[VAL]], ptr [[OUT]], align 4 +; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(5) [[OUT]], align 4 ; CHECK-NEXT: ret <16 x i32> [[LOAD]] ; ; ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit.body -; CHECK-SAME: (ptr [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT:%.*]] poison, [15 x i32] [[LOAD]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT]] [[TMP1]], i32 [[VAL]], 1 @@ -841,16 +843,16 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit -; CHECK-SAME: (ptr [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[NUM_REGS_REACH_LIMIT:%.*]] @num_regs_reach_limit.body(ptr poison, i32 [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[NUM_REGS_REACH_LIMIT:%.*]] @num_regs_reach_limit.body(ptr addrspace(5) poison, i32 [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 1 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT]] [[TMP3]], 0 ; CHECK-NEXT: ret [15 x i32] [[TMP5]] ; ; ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover.body -; CHECK-SAME: (ptr [[OUT0:%.*]], ptr [[OUT1:%.*]], i32 [[VAL0:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT0:%.*]], ptr addrspace(5) [[OUT1:%.*]], i32 [[VAL0:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD0:%.*]] = load volatile [15 x i32], ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[LOAD1:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] poison, [15 x i32] [[LOAD0]], 0 @@ -860,267 +862,267 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@num_regs_reach_limit_leftover -; CHECK-SAME: (ptr [[TMP0:%.*]], ptr [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP4:%.*]] = call [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] @num_regs_reach_limit_leftover.body(ptr poison, ptr poison, i32 [[TMP2]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]], i32 [[TMP2:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP4:%.*]] = call [[NUM_REGS_REACH_LIMIT_LEFTOVER:%.*]] @num_regs_reach_limit_leftover.body(ptr addrspace(5) poison, ptr addrspace(5) poison, i32 [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 1 -; CHECK-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = extractvalue [[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 2 -; CHECK-NEXT: store i32 [[TMP6]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP6]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue 
[[NUM_REGS_REACH_LIMIT_LEFTOVER]] [[TMP4]], 0 ; CHECK-NEXT: ret [15 x i32] [[TMP7]] ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info.body -; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i32 [[ARG0:%.*]], ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @may.clobber(), !dbg [[DBG5:![0-9]+]] ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[PRESERVE_DEBUG_INFO:%.*]] poison, i32 [[ARG0]], 0, !dbg [[DBG11:![0-9]+]] ; CHECK-NEXT: ret [[PRESERVE_DEBUG_INFO]] [[TMP1]], !dbg [[DBG11]] ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_debug_info -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !dbg [[DBG6:![0-9]+]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_DEBUG_INFO:%.*]] @preserve_debug_info.body(i32 [[TMP0]], ptr poison) +; CHECK-SAME: (i32 [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]]) #[[ATTR2]] !dbg [[DBG6:![0-9]+]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_DEBUG_INFO:%.*]] @preserve_debug_info.body(i32 [[TMP0]], ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[PRESERVE_DEBUG_INFO]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_metadata.body -; CHECK-SAME: (i32 [[ARG0:%.*]], ptr [[VAL:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i32 [[ARG0:%.*]], ptr addrspace(5) [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: call void @may.clobber() ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[PRESERVE_METADATA:%.*]] poison, i32 [[ARG0]], 0 ; CHECK-NEXT: ret [[PRESERVE_METADATA]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@preserve_metadata -; CHECK-SAME: (i32 [[TMP0:%.*]], ptr [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_access_qual !12 { -; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_METADATA:%.*]] @preserve_metadata.body(i32 [[TMP0]], ptr poison) +; CHECK-SAME: (i32 [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]]) #[[ATTR2]] !kernel_arg_access_qual !12 { +; CHECK-NEXT: [[TMP3:%.*]] = call [[PRESERVE_METADATA:%.*]] @preserve_metadata.body(i32 [[TMP0]], ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[PRESERVE_METADATA]] [[TMP3]], 0 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[TMP1]], align 4 +; CHECK-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[TMP1]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3I32:%.*]] poison, <4 x i32> [[LOAD]], 0 ; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3i32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3I32:%.*]] @bitcast_pointer_v4i32_v3i32.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3I32]] [[TMP2]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; 
CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile <4 x i32>, ptr addrspace(1) poison, align 16 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_V4I32_V3F32:%.*]] poison, <4 x i32> [[LOAD]], 0 ; CHECK-NEXT: ret [[BITCAST_POINTER_V4I32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_v4i32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_V4I32_V3F32:%.*]] @bitcast_pointer_v4i32_v3f32.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_V4I32_V3F32]] [[TMP2]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F32:%.*]] poison, i32 [[LOAD]], 0 ; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F32:%.*]] @bitcast_pointer_i32_f32.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F32]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile i32, ptr addrspace(1) poison, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_I32_F16:%.*]] poison, i32 [[LOAD]], 0 ; CHECK-NEXT: ret [[BITCAST_POINTER_I32_F16]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_i32_f16 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_I32_F16:%.*]] @bitcast_pointer_i32_f16.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_I32_F16]] [[TMP2]], 0 -; CHECK-NEXT: store i32 [[TMP3]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32.body -; CHECK-SAME: (ptr [[OUT:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[LOAD:%.*]] = load volatile half, ptr addrspace(1) poison, align 2 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_POINTER_F16_I32:%.*]] poison, half [[LOAD]], 0 ; CHECK-NEXT: ret [[BITCAST_POINTER_F16_I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_pointer_f16_i32 -; CHECK-SAME: (ptr [[TMP0:%.*]]) #[[ATTR2]] 
{ -; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(ptr poison) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP2:%.*]] = call [[BITCAST_POINTER_F16_I32:%.*]] @bitcast_pointer_f16_i32.body(ptr addrspace(5) poison) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue [[BITCAST_POINTER_F16_I32]] [[TMP2]], 0 -; CHECK-NEXT: store half [[TMP3]], ptr [[TMP0]], align 2 +; CHECK-NEXT: store half [[TMP3]], ptr addrspace(5) [[TMP0]], align 2 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(ptr poison, <3 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3F32:%.*]] @bitcast_struct_v3f32_v3f32.body(ptr addrspace(5) poison, <3 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <3 x i32> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x i32> [[VALUE]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V3I32:%.*]] poison, <4 x i32> [[EXTRACTVEC]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v3i32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(ptr poison, <3 x i32> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <3 x i32> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V3I32:%.*]] @bitcast_struct_v3f32_v3i32.body(ptr addrspace(5) poison, <3 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V3I32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v4f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) 
#[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(ptr poison, <4 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V4F32:%.*]] @bitcast_struct_v4f32_v4f32.body(ptr addrspace(5) poison, <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <4 x i32> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V4I32:%.*]] poison, <4 x i32> [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v4i32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(ptr poison, <4 x i32> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <4 x i32> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V4I32:%.*]] @bitcast_struct_v3f32_v4i32.body(ptr addrspace(5) poison, <4 x i32> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V4I32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V4F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v4f32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(ptr poison, <3 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V4F32_V3F32:%.*]] @bitcast_struct_v4f32_v3f32.body(ptr addrspace(5) poison, <3 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V4F32_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <2 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_V2F32:%.*]] poison, <2 x float> [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_v2f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <2 x float> 
[[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(ptr poison, <2 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <2 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_V2F32:%.*]] @bitcast_struct_v3f32_v2f32.body(ptr addrspace(5) poison, <2 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_V2F32]] [[TMP3]], 0 -; CHECK-NEXT: store <2 x float> [[TMP4]], ptr [[TMP0]], align 8 +; CHECK-NEXT: store <2 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 8 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[EXTRACTVEC:%.*]] = shufflevector <3 x float> [[VALUE]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] poison, <4 x float> [[EXTRACTVEC]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v3f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(ptr poison, <3 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <3 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V3F32:%.*]] @bitcast_struct_v3f32_f32_v3f32.body(ptr addrspace(5) poison, <3 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V3F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_v3f32_f32_v4f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(ptr poison, <4 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_V3F32_F32_V4F32:%.*]] @bitcast_struct_v3f32_f32_v4f32.body(ptr addrspace(5) poison, <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_V3F32_F32_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_struct_i128_v4f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], <4 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_STRUCT_I128_V4F32:%.*]] poison, <4 x float> [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_STRUCT_I128_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: 
define {{[^@]+}}@bitcast_struct_i128_v4f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(ptr poison, <4 x float> [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], <4 x float> [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_STRUCT_I128_V4F32:%.*]] @bitcast_struct_i128_v4f32.body(ptr addrspace(5) poison, <4 x float> [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_STRUCT_I128_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store <4 x float> [[TMP4]], ptr [[TMP0]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP4]], ptr addrspace(5) [[TMP0]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], [4 x float] [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_ARRAY_V4I32_V4F32:%.*]] poison, [4 x float] [[VALUE]], 0 ; CHECK-NEXT: ret [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP1]] ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_array_v4i32_v4f32 -; CHECK-SAME: (ptr [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body(ptr poison, [4 x float] [[TMP1]]) +; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], [4 x float] [[TMP1:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP3:%.*]] = call [[BITCAST_ARRAY_V4I32_V4F32:%.*]] @bitcast_array_v4i32_v4f32.body(ptr addrspace(5) poison, [4 x float] [[TMP1]]) ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue [[BITCAST_ARRAY_V4I32_V4F32]] [[TMP3]], 0 -; CHECK-NEXT: store [4 x float] [[TMP4]], ptr [[TMP0]], align 4 +; CHECK-NEXT: store [4 x float] [[TMP4]], ptr addrspace(5) [[TMP0]], align 4 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32.body -; CHECK-SAME: (i1 [[COND:%.*]], ptr [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (i1 [[COND:%.*]], ptr addrspace(5) [[OUT:%.*]], <3 x float> [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[COND]], label [[RET0:%.*]], label [[RET1:%.*]] ; CHECK: ret0: @@ -1134,23 +1136,23 @@ attributes #2 = { alwaysinline nounwind } ; ; ; CHECK-LABEL: define {{[^@]+}}@multi_return_bitcast_struct_v3f32_v3f32 -; CHECK-SAME: (i1 [[TMP0:%.*]], ptr [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] { -; CHECK-NEXT: [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], ptr poison, <3 x float> [[TMP2]]) +; CHECK-SAME: (i1 [[TMP0:%.*]], ptr addrspace(5) [[TMP1:%.*]], <3 x float> [[TMP2:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: [[TMP4:%.*]] = call [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32:%.*]] @multi_return_bitcast_struct_v3f32_v3f32.body(i1 [[TMP0]], ptr addrspace(5) poison, <3 x float> [[TMP2]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue [[MULTI_RETURN_BITCAST_STRUCT_V3F32_V3F32]] [[TMP4]], 0 -; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP1]], align 16 +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr addrspace(5) [[TMP1]], align 16 ; CHECK-NEXT: ret void ; ; ; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32.body -; CHECK-SAME: (ptr [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] { +; CHECK-SAME: (ptr addrspace(5) [[OUT:%.*]], [[STRUCT_V3F32:%.*]] [[VALUE:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[BITCAST_V3F32_STRUCT_V3F32:%.*]] 
poison, [[STRUCT_V3F32]] [[VALUE]], 0
; CHECK-NEXT:    ret [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP1]]
;
;
; CHECK-LABEL: define {{[^@]+}}@bitcast_v3f32_struct_v3f32
-; CHECK-SAME: (ptr [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
-; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(ptr poison, [[STRUCT_V3F32]] [[TMP1]])
+; CHECK-SAME: (ptr addrspace(5) [[TMP0:%.*]], [[STRUCT_V3F32:%.*]] [[TMP1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT:    [[TMP3:%.*]] = call [[BITCAST_V3F32_STRUCT_V3F32:%.*]] @bitcast_v3f32_struct_v3f32.body(ptr addrspace(5) poison, [[STRUCT_V3F32]] [[TMP1]])
 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue [[BITCAST_V3F32_STRUCT_V3F32]] [[TMP3]], 0
-; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], ptr [[TMP0]], align 16
+; CHECK-NEXT:    store [[STRUCT_V3F32]] [[TMP4]], ptr addrspace(5) [[TMP0]], align 16
 ; CHECK-NEXT:    ret void
 ;

From ba11d314a67638454989d4e0aebae64145d1a8ac Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Thu, 26 Oct 2023 16:15:29 +0200
Subject: [PATCH 073/877] [clang][Interp] Only diagnose null field access in
 constant contexts (#69223)

Looks like this should work as long as we don't dereference the value.
---
 clang/lib/AST/Interp/Interp.cpp   |  2 +-
 clang/lib/AST/Interp/Interp.h     |  2 +-
 clang/lib/AST/Interp/Pointer.h    | 28 +++++++++++++++++++++-----
 clang/test/AST/Interp/c.c         | 14 +++++++++++++
 clang/test/AST/Interp/records.cpp | 33 +++++++++++++++++++++++++++++++
 5 files changed, 72 insertions(+), 7 deletions(-)

diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp
index 31d43b6010c18..1ebbadc375f38 100644
--- a/clang/lib/AST/Interp/Interp.cpp
+++ b/clang/lib/AST/Interp/Interp.cpp
@@ -216,7 +216,7 @@ bool CheckLive(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
 }
 
 bool CheckDummy(InterpState &S, CodePtr OpPC, const Pointer &Ptr) {
-  return !Ptr.isDummy();
+  return !Ptr.isZero() && !Ptr.isDummy();
 }
 
 bool CheckNull(InterpState &S, CodePtr OpPC, const Pointer &Ptr,
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 3e1b4f32e8b69..7dd415d6e4605 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1155,7 +1155,7 @@ inline bool GetPtrGlobal(InterpState &S, CodePtr OpPC, uint32_t I) {
 /// 2) Pushes Pointer.atField(Off) on the stack
 inline bool GetPtrField(InterpState &S, CodePtr OpPC, uint32_t Off) {
   const Pointer &Ptr = S.Stk.pop<Pointer>();
-  if (!CheckNull(S, OpPC, Ptr, CSK_Field))
+  if (S.inConstantContext() && !CheckNull(S, OpPC, Ptr, CSK_Field))
     return false;
   if (!CheckExtern(S, OpPC, Ptr))
     return false;
diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h
index b371b306fe7a7..843bcad16b5d1 100644
--- a/clang/lib/AST/Interp/Pointer.h
+++ b/clang/lib/AST/Interp/Pointer.h
@@ -199,7 +199,10 @@ class Pointer {
   bool isField() const { return Base != 0 && Base != RootPtrMark; }
 
   /// Accessor for information about the declaration site.
-  const Descriptor *getDeclDesc() const { return Pointee->Desc; }
+  const Descriptor *getDeclDesc() const {
+    assert(Pointee);
+    return Pointee->Desc;
+  }
   SourceLocation getDeclLoc() const { return getDeclDesc()->getLocation(); }
 
   /// Returns a pointer to the object of which this pointer is a field.
@@ -296,11 +299,17 @@ class Pointer {
   bool isUnion() const;
 
   /// Checks if the storage is extern.
-  bool isExtern() const { return Pointee->isExtern(); }
+  bool isExtern() const { return Pointee && Pointee->isExtern(); }
   /// Checks if the storage is static.
-  bool isStatic() const { return Pointee->isStatic(); }
+  bool isStatic() const {
+    assert(Pointee);
+    return Pointee->isStatic();
+  }
   /// Checks if the storage is temporary.
-  bool isTemporary() const { return Pointee->isTemporary(); }
+  bool isTemporary() const {
+    assert(Pointee);
+    return Pointee->isTemporary();
+  }
   /// Checks if the storage is a static temporary.
   bool isStaticTemporary() const { return isStatic() && isTemporary(); }
 
@@ -323,7 +332,10 @@ class Pointer {
   }
 
   /// Returns the declaration ID.
-  std::optional<unsigned> getDeclID() const { return Pointee->getDeclID(); }
+  std::optional<unsigned> getDeclID() const {
+    assert(Pointee);
+    return Pointee->getDeclID();
+  }
 
   /// Returns the byte offset from the start.
   unsigned getByteOffset() const {
@@ -351,6 +363,8 @@ class Pointer {
 
   /// Checks if the index is one past end.
   bool isOnePastEnd() const {
+    if (!Pointee)
+      return false;
     return isElementPastEnd() || getSize() == getOffset();
   }
 
@@ -360,6 +374,7 @@ class Pointer {
   /// Dereferences the pointer, if it's live.
   template <typename T> T &deref() const {
     assert(isLive() && "Invalid pointer");
+    assert(Pointee);
     if (isArrayRoot())
       return *reinterpret_cast<T *>(Pointee->rawData() + Base +
                                     sizeof(InitMapPtr));
@@ -370,6 +385,7 @@ class Pointer {
   /// Dereferences a primitive element.
   template <typename T> T &elem(unsigned I) const {
     assert(I < getNumElems());
+    assert(Pointee);
     return reinterpret_cast<T *>(Pointee->data() + sizeof(InitMapPtr))[I];
   }
 
@@ -431,12 +447,14 @@ class Pointer {
   /// Returns a descriptor at a given offset.
   InlineDescriptor *getDescriptor(unsigned Offset) const {
     assert(Offset != 0 && "Not a nested pointer");
+    assert(Pointee);
     return reinterpret_cast<InlineDescriptor *>(Pointee->rawData() + Offset) - 1;
   }
 
   /// Returns a reference to the InitMapPtr which stores the initialization map.
   InitMapPtr &getInitMap() const {
+    assert(Pointee);
     return *reinterpret_cast<InitMapPtr *>(Pointee->rawData() + Base);
   }
 
diff --git a/clang/test/AST/Interp/c.c b/clang/test/AST/Interp/c.c
index e8aa8b8599f21..6bfcded0a7864 100644
--- a/clang/test/AST/Interp/c.c
+++ b/clang/test/AST/Interp/c.c
@@ -3,6 +3,8 @@
 // RUN: %clang_cc1 -verify=ref -std=c11 %s
 // RUN: %clang_cc1 -pedantic -verify=pedantic-ref -std=c11 %s
 
+typedef __INTPTR_TYPE__ intptr_t;
+
 _Static_assert(1, "");
 _Static_assert(0 != 1, "");
 _Static_assert(1.0 == 1.0, ""); // pedantic-ref-warning {{not an integer constant expression}} \
@@ -67,3 +69,15 @@ _Static_assert(&Test50 != (void*)0, ""); // ref-warning {{always true}} \
                                          // expected-warning {{always true}} \
                                          // pedantic-expected-warning {{always true}} \
                                          // pedantic-expected-warning {{is a GNU extension}}
+
+struct y {int x,y;};
+int a2[(intptr_t)&((struct y*)0)->y]; // expected-warning {{folded to constant array}} \
+                                      // pedantic-expected-warning {{folded to constant array}} \
+                                      // ref-warning {{folded to constant array}} \
+                                      // pedantic-ref-warning {{folded to constant array}}
+
+const struct y *yy = (struct y*)0;
+const intptr_t L = (intptr_t)(&(yy->y)); // expected-error {{not a compile-time constant}} \
+                                         // pedantic-expected-error {{not a compile-time constant}} \
+                                         // ref-error {{not a compile-time constant}} \
+                                         // pedantic-ref-error {{not a compile-time constant}}
diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp
index e899e37915f03..280eaf34898ce 100644
--- a/clang/test/AST/Interp/records.cpp
+++ b/clang/test/AST/Interp/records.cpp
@@ -1102,3 +1102,36 @@ namespace DelegatingConstructors {
   static_assert(d4.a == 10, "");
   static_assert(d4.b == 12, "");
 }
+
+namespace AccessOnNullptr {
+  struct F {
+    int a;
+  };
+
constexpr int a() { // expected-error {{never produces a constant expression}} \ + // ref-error {{never produces a constant expression}} + F *f = nullptr; + + f->a = 0; // expected-note 2{{cannot access field of null pointer}} \ + // ref-note 2{{cannot access field of null pointer}} + return f->a; + } + static_assert(a() == 0, ""); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to 'a()'}} \ + // ref-error {{not an integral constant expression}} \ + // ref-note {{in call to 'a()'}} + + constexpr int a2() { // expected-error {{never produces a constant expression}} \ + // ref-error {{never produces a constant expression}} + F *f = nullptr; + + + const int *a = &(f->a); // expected-note 2{{cannot access field of null pointer}} \ + // ref-note 2{{cannot access field of null pointer}} + return f->a; + } + static_assert(a2() == 0, ""); // expected-error {{not an integral constant expression}} \ + // expected-note {{in call to 'a2()'}} \ + // ref-error {{not an integral constant expression}} \ + // ref-note {{in call to 'a2()'}} +} From aaabf50d521550c0f6c0b5c8623450eb56f485f5 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 26 Oct 2023 15:33:51 +0100 Subject: [PATCH 074/877] [AArch64] Regenerate tests to show missing constant comments --- .../test/CodeGen/AArch64/arm64-abi-varargs.ll | 40 +++++++++---------- .../CodeGen/AArch64/arm64-build-vector.ll | 12 +++--- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll index d943afb23c03b..1b22514a59d60 100644 --- a/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll @@ -64,37 +64,37 @@ define i32 @main() nounwind ssp { ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #96 ; CHECK-NEXT: stp x29, x30, [sp, #80] ; 16-byte Folded Spill -; CHECK-NEXT: mov w9, #1 -; CHECK-NEXT: mov w8, #2 +; CHECK-NEXT: mov w9, #1 ; =0x1 +; CHECK-NEXT: mov w8, #2 ; =0x2 ; CHECK-NEXT: stp w8, w9, [sp, #72] -; CHECK-NEXT: mov w9, #3 -; CHECK-NEXT: mov w8, #4 +; CHECK-NEXT: mov w9, #3 ; =0x3 +; CHECK-NEXT: mov w8, #4 ; =0x4 ; CHECK-NEXT: stp w8, w9, [sp, #64] -; CHECK-NEXT: mov w9, #5 -; CHECK-NEXT: mov w8, #6 +; CHECK-NEXT: mov w9, #5 ; =0x5 +; CHECK-NEXT: mov w8, #6 ; =0x6 ; CHECK-NEXT: stp w8, w9, [sp, #56] -; CHECK-NEXT: mov w9, #7 -; CHECK-NEXT: mov w8, #8 +; CHECK-NEXT: mov w9, #7 ; =0x7 +; CHECK-NEXT: mov w8, #8 ; =0x8 ; CHECK-NEXT: stp w8, w9, [sp, #48] -; CHECK-NEXT: mov w8, #9 -; CHECK-NEXT: mov w9, #10 +; CHECK-NEXT: mov w8, #9 ; =0x9 +; CHECK-NEXT: mov w9, #10 ; =0xa ; CHECK-NEXT: stp w9, w8, [sp, #40] -; CHECK-NEXT: mov w10, #11 -; CHECK-NEXT: mov w11, #12 +; CHECK-NEXT: mov w10, #11 ; =0xb +; CHECK-NEXT: mov w11, #12 ; =0xc ; CHECK-NEXT: stp w11, w10, [sp, #32] ; CHECK-NEXT: stp x10, x11, [sp, #16] ; CHECK-NEXT: str x9, [sp, #8] ; CHECK-NEXT: str w8, [sp] ; CHECK-NEXT: add x0, sp, #76 -; CHECK-NEXT: mov w1, #2 -; CHECK-NEXT: mov w2, #3 -; CHECK-NEXT: mov w3, #4 -; CHECK-NEXT: mov w4, #5 -; CHECK-NEXT: mov w5, #6 -; CHECK-NEXT: mov w6, #7 -; CHECK-NEXT: mov w7, #8 +; CHECK-NEXT: mov w1, #2 ; =0x2 +; CHECK-NEXT: mov w2, #3 ; =0x3 +; CHECK-NEXT: mov w3, #4 ; =0x4 +; CHECK-NEXT: mov w4, #5 ; =0x5 +; CHECK-NEXT: mov w5, #6 ; =0x6 +; CHECK-NEXT: mov w6, #7 ; =0x7 +; CHECK-NEXT: mov w7, #8 ; =0x8 ; CHECK-NEXT: bl _fn9 -; CHECK-NEXT: mov w0, #0 +; CHECK-NEXT: mov w0, #0 ; =0x0 ; CHECK-NEXT: ldp x29, x30, [sp, #80] ; 16-byte Folded Reload ; CHECK-NEXT: add sp, sp, #96 ; 
CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll index f9f57e662d6ae..68c56d765cbb9 100644 --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -24,7 +24,7 @@ define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { define <8 x i16> @build_all_zero(<8 x i16> %a) #1 { ; CHECK-LABEL: build_all_zero: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #44672 +; CHECK-NEXT: mov w8, #44672 // =0xae80 ; CHECK-NEXT: fmov s1, w8 ; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret @@ -56,7 +56,7 @@ define <8 x i16> @concat_2_build_vector(<4 x i16> %in0) { define void @widen_f16_build_vector(ptr %addr) { ; CHECK-LABEL: widen_f16_build_vector: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #13294 +; CHECK-NEXT: mov w8, #13294 // =0x33ee ; CHECK-NEXT: movk w8, #13294, lsl #16 ; CHECK-NEXT: str w8, [x0] ; CHECK-NEXT: ret @@ -68,7 +68,7 @@ define void @widen_f16_build_vector(ptr %addr) { define <1 x i64> @single_element_vector_i64(<1 x i64> %arg) { ; CHECK-LABEL: single_element_vector_i64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov w8, #1 +; CHECK-NEXT: mov w8, #1 // =0x1 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: add d0, d0, d1 ; CHECK-NEXT: ret @@ -94,7 +94,7 @@ define <1 x double> @convert_single_fp_vector_constant(i1 %cmp) { ; CHECK-LABEL: convert_single_fp_vector_constant: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: tst w0, #0x1 -; CHECK-NEXT: mov x8, #4607182418800017408 +; CHECK-NEXT: mov x8, #4607182418800017408 // =0x3ff0000000000000 ; CHECK-NEXT: csetm x9, ne ; CHECK-NEXT: fmov d0, x8 ; CHECK-NEXT: fmov d1, x9 @@ -120,7 +120,7 @@ define <2 x double> @poszero_v2f64(<2 x double> %a) { define <2 x double> @negzero_v2f64(<2 x double> %a) { ; CHECK-LABEL: negzero_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 ; CHECK-NEXT: dup v1.2d, x8 ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret @@ -141,7 +141,7 @@ define <1 x double> @poszero_v1f64(<1 x double> %a) { define <1 x double> @negzero_v1f64(<1 x double> %a) { ; CHECK-LABEL: negzero_v1f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 +; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 ; CHECK-NEXT: fmov d1, x8 ; CHECK-NEXT: fmul d0, d0, d1 ; CHECK-NEXT: ret From 13a349425b55f4322ef4410cd6f54587aa80f1d0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 26 Oct 2023 15:35:09 +0100 Subject: [PATCH 075/877] [AArch64] Regenerate addr-of-ret-addr.ll --- llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll | 62 ++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll index 2de708d66f59f..b6ec9eb569436 100644 --- a/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll +++ b/llvm/test/CodeGen/AArch64/addr-of-ret-addr.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc < %s -frame-pointer=all -mtriple=arm64-windows | FileCheck %s ; Test generated from C code: @@ -15,18 +16,58 @@ declare void @llvm.va_start(ptr) declare ptr @llvm.addressofreturnaddress() define dso_local ptr @"foo"() { +; CHECK-LABEL: foo: +; CHECK: .seh_proc foo +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .seh_set_fp +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: add x0, x29, #8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr_x 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc entry: %0 = call ptr @llvm.addressofreturnaddress() ret ptr %0 - -; CHECK-LABEL: foo -; CHECK: stp x29, x30, [sp, #-16]! -; CHECK: mov x29, sp -; CHECK: add x0, x29, #8 -; CHECK: ldp x29, x30, [sp], #16 } define dso_local i32 @"bar"(ptr %x, ...) { +; CHECK-LABEL: bar: +; CHECK: .seh_proc bar +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #96 +; CHECK-NEXT: .seh_stackalloc 96 +; CHECK-NEXT: stp x29, x30, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 16 +; CHECK-NEXT: add x29, sp, #16 +; CHECK-NEXT: .seh_add_fp 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: add x9, x29, #24 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: stp x1, x2, [x29, #24] +; CHECK-NEXT: stp x9, x0, [sp] +; CHECK-NEXT: add x0, x29, #24 +; CHECK-NEXT: add x1, x29, #8 +; CHECK-NEXT: stp x3, x4, [x29, #40] +; CHECK-NEXT: stp x5, x6, [x29, #56] +; CHECK-NEXT: str x7, [x29, #72] +; CHECK-NEXT: blr x8 +; CHECK-NEXT: add w0, w0, #1 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 16 +; CHECK-NEXT: add sp, sp, #96 +; CHECK-NEXT: .seh_stackalloc 96 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc entry: %x.addr = alloca ptr, align 8 %y = alloca ptr, align 8 @@ -38,13 +79,4 @@ entry: %call = call i32 %0(ptr %2, ptr %1) %add = add nsw i32 %call, 1 ret i32 %add - -; CHECK-LABEL: bar -; CHECK: sub sp, sp, #96 -; CHECK: stp x29, x30, [sp, #16] -; CHECK: add x29, sp, #16 -; CHECK: stp x1, x2, [x29, #24] -; CHECK: add x1, x29, #8 -; CHECK: ldp x29, x30, [sp, #16] -; CHECK: add sp, sp, #96 } From 53096f910ca874cf446417d0bf551c5d63e917b2 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 26 Oct 2023 07:36:43 -0700 Subject: [PATCH 076/877] [lldb] Try to fix build after d1556e5efbf0 Getting lots of `error: unknown type name 'uint64_t'` and `uint32_t` in this file on Linux. --- lldb/include/lldb/Target/RegisterFlags.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/include/lldb/Target/RegisterFlags.h b/lldb/include/lldb/Target/RegisterFlags.h index 7c5b97c2265fd..a088981918cb3 100644 --- a/lldb/include/lldb/Target/RegisterFlags.h +++ b/lldb/include/lldb/Target/RegisterFlags.h @@ -9,6 +9,7 @@ #ifndef LLDB_TARGET_REGISTERFLAGS_H #define LLDB_TARGET_REGISTERFLAGS_H +#include #include #include From 4afe550ba677ca8998f548a80c832ee0f940cca6 Mon Sep 17 00:00:00 2001 From: Kristof Beyls Date: Thu, 26 Oct 2023 16:38:32 +0200 Subject: [PATCH 077/877] [Security Group] add github names of security group members. (#69304) Also drop phabricator names as we no longer use phabricator. 
--------- Co-authored-by: Andy Kaylor --- llvm/docs/Security.rst | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst index 2729541facbfe..61ba9aafaa394 100644 --- a/llvm/docs/Security.rst +++ b/llvm/docs/Security.rst @@ -31,28 +31,31 @@ Group Composition Security Group Members ---------------------- -The members of the group represent a wide cross-section of the community, and meet the criteria for inclusion below. The list is in the format `* ${full_name} (${affiliation}) [${phabricator_username}]`. If a phabricator username for an individual isn't available, the brackets will be empty. - -* Ahmed Bougacha (Apple) [ab] -* Andy Kaylor (Intel) [andykaylor] -* Artur Pilipenko (Azul Systems Inc) [apilipenko] -* Boovaragavan Dasarathan (Nvidia) [mrragava] -* Dimitry Andric (individual; FreeBSD) [dim] -* Ed Maste (individual; FreeBSD) [emaste] -* George Burgess IV (Google) [george.burgess.iv] -* Josh Stone (Red Hat; Rust) [cuviper] +The members of the group represent a wide cross-section of the community, and +meet the criteria for inclusion below. The list is in the format +`* ${full_name} (${affiliation}) [${github_username}]`. If a github +username for an individual isn't available, the brackets will be empty. + +* Ahmed Bougacha (Apple) [@ahmedbougacha] +* Andy Kaylor (Intel) [@andykaylor] +* Artur Pilipenko (Azul Systems Inc) [] +* Boovaragavan Dasarathan (Nvidia) [@mrragava] +* Dimitry Andric (individual; FreeBSD) [@DimitryAndric] +* Ed Maste (individual; FreeBSD) [@emaste] +* George Burgess IV (Google) [@gburgessiv] +* Josh Stone (Red Hat; Rust) [@cuviper] * Kate McInnes (Apple) [] -* Kristof Beyls (ARM) [kristof.beyls] -* Matthew Riley (Google) [mattdr] -* Nikhil Gupta (Nvidia) [nikhgupt] -* Oliver Hunt (Apple) [ojhunt] -* Paul Robinson (Sony) [probinson] -* Peter Smith (ARM) [peter.smith] -* Pietro Albini (Ferrous Systems; Rust) [pietroalbini] -* Serge Guelton (Mozilla) [serge-sans-paille] -* Sergey Maslov (Intel) [smaslov-intel] -* Shayne Hiet-Block (Microsoft) [Shayne] -* Tim Penge (Sony) [tpenge] +* Kristof Beyls (ARM) [@kbeyls] +* Matthew Riley (Google) [@mmdriley] +* Nikhil Gupta (Nvidia) [] +* Oliver Hunt (Apple) [@ojhunt] +* Paul Robinson (Sony) [@pogo59] +* Peter Smith (ARM) [@smithp35] +* Pietro Albini (Ferrous Systems; Rust) [@pietroalbini] +* Serge Guelton (Mozilla) [@serge-sans-paille] +* Sergey Maslov (Intel) [@smaslov-intel] +* Shayne Hiet-Block (Microsoft) [@GreatKeeper] +* Tim Penge (Sony) [] Criteria -------- From 7caff73e38038a98b837566eaf0e2e50754e2443 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 26 Oct 2023 15:39:10 +0100 Subject: [PATCH 078/877] [AMDGPU] Assert that we can find subregs in copyPhysReg. NFC. (#70332) This helped to catch a codegen failure caused by #69703. 
MachineVerifier did not complain about this malformed COPY either before regalloc: %9:vreg_64 = COPY %17:vgpr_32 Or after regalloc: renamable $vgpr0_vgpr1 = COPY renamable $vgpr2, implicit $exec But we can at least catch the problem when copyPhysReg tries to expand it into 32-bit register moves and fails to find suitable source registers: $vgpr0 = V_MOV_B32_e32 $noreg, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2 $vgpr1 = V_MOV_B32_e32 $noreg, implicit $exec, implicit $vgpr2, implicit $exec --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 51 ++++++++++++++------------ 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ffcd415a66648..62f5a17635cee 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -742,23 +742,27 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; - Register Reg = RI.getSubReg(DestReg, SubIdx); + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); unsigned Opcode = AMDGPU::S_MOV_B32; // Is SGPR aligned? If so try to combine with next. - Register Src = RI.getSubReg(SrcReg, SubIdx); - bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; - bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; + bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0; + bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0; if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { // Can use SGPR64 copy unsigned Channel = RI.getChannelFromSubReg(SubIdx); SubIdx = RI.getSubRegFromChannel(Channel, 2); + DestSubReg = RI.getSubReg(DestReg, SubIdx); + SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); Opcode = AMDGPU::S_MOV_B64; Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)) + LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) + .addReg(SrcSubReg) .addReg(SrcReg, RegState::Implicit); if (!FirstMI) @@ -1098,6 +1102,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SubIdx = SubIndices[Idx]; else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); bool IsFirstSubreg = Idx == 0; bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; @@ -1105,30 +1112,26 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { Register ImpDefSuper = IsFirstSubreg ? 
Register(DestReg) : Register(); Register ImpUseSuper = SrcReg; - indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), - RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap, - ImpDefSuper, ImpUseSuper); + indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill, + *RS, Overlap, ImpDefSuper, ImpUseSuper); } else if (Opcode == AMDGPU::V_PK_MOV_B32) { - Register DstSubReg = RI.getSubReg(DestReg, SubIdx); - Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) - .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); if (IsFirstSubreg) MIB.addReg(DestReg, RegState::Define | RegState::Implicit); } else { MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)); + BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); if (IsFirstSubreg) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); From 3c58e53041fcdeae36ef5ca1e0683e0f1f16bf69 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 26 Oct 2023 15:42:09 +0100 Subject: [PATCH 079/877] [AMDGPU] Use const reference in SIInstrInfo::buildExtractSubReg. NFC. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 20 +++++++------------- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 13 ++++++------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 62f5a17635cee..327f8988ac2f1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5416,13 +5416,10 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MO.ChangeToRegister(Reg, false); } -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { +unsigned SIInstrInfo::buildExtractSubReg( + MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); @@ -5449,12 +5446,9 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, } MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { + MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI, + const MachineOperand &Op, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { if (SubIdx == AMDGPU::sub0) return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index a64cf0244e4c0..e6c64d909d3ee 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -102,16 +102,15 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo { public: unsigned buildExtractSubReg(MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, - MachineOperand &SuperReg, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, unsigned SubIdx, const TargetRegisterClass *SubRC) const; - MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const; + MachineOperand buildExtractSubRegOrImm( + MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const; + private: void swapOperands(MachineInstr &Inst) const; From 35baff8b6ac4630d28db91d6b481d6cd5910931e Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 26 Oct 2023 08:02:10 -0700 Subject: [PATCH 080/877] [AMDGPU] Correct assert that incorrectly chained multiple == operators. (#70291) I believe this assert was trying to check that 3 variables were equal to 0. I think it instead got interpreted as ((DSWCount == DSWWithPermCount) == DSWWithSharedVMEMCount) == 0 I guess (DSWCount == DSWWithPermCount) was true because both counts were 0. Then true got compared to DSWWithSharedVMEMCount, and since DSWWithSharedVMEMCount is 0, that compare was false. And then that false compared equal to the final 0.
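To make the misparse concrete, here is a minimal standalone C++ sketch (illustrative only, not part of the patch; the counter values are invented for the example):

  #include <cassert>

  int main() {
    unsigned DSWCount = 7, DSWWithPermCount = 7, DSWWithSharedVMEMCount = 0;
    // Chained == is left-associative, so the old condition parses as shown:
    //   (7 == 7) -> true; (true == 0) -> false; (false == 0) -> true.
    bool Chained =
        ((DSWCount == DSWWithPermCount) == DSWWithSharedVMEMCount) == 0;
    // The intended condition compares each counter to zero separately.
    bool Intended = DSWCount == 0 && DSWWithPermCount == 0 &&
                    DSWWithSharedVMEMCount == 0;
    // The old assert would pass for these nonzero counters; the fixed one
    // would fire.
    assert(Chained && !Intended);
    return 0;
  }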
--- llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp index c67a21c639fc0..0b2bb98738be2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp @@ -1121,8 +1121,8 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy( unsigned MFMACount = 0; unsigned DSRCount = 0; - assert((IsPostRA || - DSWCount == DSWWithPermCount == DSWWithSharedVMEMCount == 0) && + assert((IsPostRA || (DSWCount == 0 && DSWWithPermCount == 0 && + DSWWithSharedVMEMCount == 0)) && "DSWCounters should be zero in pre-RA scheduling!"); SmallVector<SUnit *, 10> DSWithPerms; for (auto &SU : DAG->SUnits) { From 2d0ac85b6693eb6e25b4120c0e4e224c42a84462 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 25 Oct 2023 22:15:10 -0700 Subject: [PATCH 081/877] [X86] Fix gcc warning about mix of enumeral and non-enumeral types. NFC --- llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 93db31e03e116..6411f27da0776 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -3297,21 +3297,22 @@ unsigned X86TargetLowering::preferedOpcodeForCmpEqPiecesOfOperand( // If the current setup has imm64 mask, then inverse will have // at least imm32 mask (or be zext i32 -> i64). if (VT == MVT::i64) - return AndMask->getSignificantBits() > 32 ? ISD::SRL : ShiftOpc; + return AndMask->getSignificantBits() > 32 ? (unsigned)ISD::SRL + : ShiftOpc; // We can only benefit if req at least 7-bit for the mask. We // don't want to replace shl of 1,2,3 as they can be implemented // with lea/add. - return ShiftOrRotateAmt.uge(7) ? ISD::SRL : ShiftOpc; + return ShiftOrRotateAmt.uge(7) ? (unsigned)ISD::SRL : ShiftOpc; } if (VT == MVT::i64) // Keep exactly 32-bit imm64, this is zext i32 -> i64 which is // extremely efficient. - return AndMask->getSignificantBits() > 33 ? ISD::SHL : ShiftOpc; + return AndMask->getSignificantBits() > 33 ? (unsigned)ISD::SHL : ShiftOpc; // Keep small shifts as shl so we can generate add/lea. - return ShiftOrRotateAmt.ult(7) ? ISD::SHL : ShiftOpc; + return ShiftOrRotateAmt.ult(7) ? (unsigned)ISD::SHL : ShiftOpc; } // We prefer rotate for vectors of if we won't get a zext mask with SRL From 78941e1eedb121344e0d969458ea85598cd749df Mon Sep 17 00:00:00 2001 From: hassnaaHamdi Date: Thu, 26 Oct 2023 16:09:59 +0100 Subject: [PATCH 082/877] [llvm][AArch64][Assembly]: Add FP8 instructions assembly and disassembly.
(#69632) This patch adds the feature flag FP8 and the assembly/disassembly for the following instructions of NEON, SVE2 and SME2: * NEON Instructions: + Advanced SIMD two-register miscellaneous: - F1CVTL, F1CVTL2, F2CVTL, F2CVTL2 - BF1CVTL, BF1CVTL2, BF2CVTL, BF2CVTL2 + Advanced SIMD three-register extension: - FCVTN, FCVTN2 (FP32 to FP8) - FCVTN (FP16 to FP8) + Advanced SIMD three same: - FSCALE * SVE2 Instructions: + Downconvert instructions: - FCVTN_Z2Z_HtoB - FCVTNB_Z2Z_StoB - BFCVTN_Z2Z_HtoB - FCVTNT_Z2Z_StoB + Upconvert instructions: - F1CVT_ZZ, F2CVT_ZZ - BF1CVT_ZZ, BF2CVT_ZZ - F1CVTLT_ZZ, F2CVTLT_ZZ - BF1CVTLT_ZZ, BF2CVTLT_ZZ * SME2 Instructions: - F1CVT_2ZZ, F2CVT_2ZZ - BF1CVT_2ZZ, BF2CVT_2ZZ - F1CVTL_2ZZ, F2CVTL_2ZZ - BF1CVTL_2ZZ, BF2CVTL_2ZZ - FCVT_Z2Z_HtoB, BFCVT_Z2Z_HtoB - FCVT_Z4Z - FCVTN_Z4Z - FSCALE_2ZZ, FSCALE_4ZZ - FSCALE_2Z2Z, FSCALE_4Z4Z That is according to this documentation: https://developer.arm.com/documentation/ddi0602/2023-09 --- .../llvm/TargetParser/AArch64TargetParser.h | 2 + llvm/lib/Target/AArch64/AArch64.td | 3 + .../lib/Target/AArch64/AArch64InstrFormats.td | 53 +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 15 + .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 39 +- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 21 ++ .../AArch64/AsmParser/AArch64AsmParser.cpp | 1 + llvm/lib/Target/AArch64/SMEInstrFormats.td | 41 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 43 +++ .../MC/AArch64/FP8/directive-arch-negative.s | 7 + llvm/test/MC/AArch64/FP8/directive-arch.s | 7 + .../FP8/miscellaneous-fp8-diagnostics.s | 84 +++++ llvm/test/MC/AArch64/FP8/miscellaneous-fp8.s | 355 ++++++++++++++++++ .../MC/AArch64/FP8_SME2/cvt-diagnostics.s | 87 +++++ llvm/test/MC/AArch64/FP8_SME2/cvt.s | 157 ++++++++ .../MC/AArch64/FP8_SME2/fscale-diagnostics.c | 62 +++ llvm/test/MC/AArch64/FP8_SME2/fscale.s | 160 ++++++++ .../MC/AArch64/FP8_SVE2/fcvt-diagnostics.s | 131 +++++++ llvm/test/MC/AArch64/FP8_SVE2/fcvt.s | 237 ++++++++++++ .../MC/AArch64/FP8_SVE2/fcvtn-diagnostics.s | 70 ++++ llvm/test/MC/AArch64/FP8_SVE2/fcvtn.s | 125 ++++++ .../test/MC/AArch64/SVE2/fcvtnt-diagnostics.s | 4 +- .../TargetParser/TargetParserTest.cpp | 4 +- 23 files changed, 1686 insertions(+), 22 deletions(-) create mode 100644 llvm/test/MC/AArch64/FP8/directive-arch-negative.s create mode 100644 llvm/test/MC/AArch64/FP8/directive-arch.s create mode 100644 llvm/test/MC/AArch64/FP8/miscellaneous-fp8-diagnostics.s create mode 100644 llvm/test/MC/AArch64/FP8/miscellaneous-fp8.s create mode 100644 llvm/test/MC/AArch64/FP8_SME2/cvt-diagnostics.s create mode 100644 llvm/test/MC/AArch64/FP8_SME2/cvt.s create mode 100644 llvm/test/MC/AArch64/FP8_SME2/fscale-diagnostics.c create mode 100644 llvm/test/MC/AArch64/FP8_SME2/fscale.s create mode 100644 llvm/test/MC/AArch64/FP8_SVE2/fcvt-diagnostics.s create mode 100644 llvm/test/MC/AArch64/FP8_SVE2/fcvt.s create mode 100644 llvm/test/MC/AArch64/FP8_SVE2/fcvtn-diagnostics.s create mode 100644 llvm/test/MC/AArch64/FP8_SVE2/fcvtn.s diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index bf14473f133fa..8ff2947794251 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -160,6 +160,7 @@ enum ArchExtKind : unsigned { AEK_ITE = 56, // FEAT_ITE AEK_GCS = 57, // FEAT_GCS AEK_FPMR = 58, // FEAT_FPMR + AEK_FP8 = 59, // FEAT_FP8 AEK_NUM_EXTENSIONS }; using ExtensionBitset = Bitset<AEK_NUM_EXTENSIONS>; @@ -269,6 +270,7 @@ inline constexpr ExtensionInfo
Extensions[] = { {"wfxt", AArch64::AEK_NONE, {}, {}, FEAT_WFXT, "+wfxt", 550}, {"gcs", AArch64::AEK_GCS, "+gcs", "-gcs", FEAT_INIT, "", 0}, {"fpmr", AArch64::AEK_FPMR, "+fpmr", "-fpmr", FEAT_INIT, "", 0}, + {"fp8", AArch64::AEK_FP8, "+fp8", "-fp8", FEAT_INIT, "+fpmr", 0}, // Special cases {"none", AArch64::AEK_NONE, {}, {}, FEAT_INIT, "", ExtensionInfo::MaxFMVPriority}, }; diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index ced1d43892036..0c0fa82ffe93c 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -130,6 +130,9 @@ def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", def FeatureFPMR : SubtargetFeature<"fpmr", "HasFPMR", "true", "Enable FPMR Register (FEAT_FPMR)">; +def FeatureFP8 : SubtargetFeature<"fp8", "HasFP8", "true", + "Enable FP8 instructions (FEAT_FP8)">; + // This flag is currently still labeled as Experimental, but when fully // implemented this should tell the compiler to use the zeroing pseudos to // benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index e5dbfa404b3c6..a48bf77a774b7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6056,6 +6056,49 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm, } +// FP8 assembly/disassembly classes + +//---------------------------------------------------------------------------- +// FP8 Advanced SIMD three-register extension +//---------------------------------------------------------------------------- +class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op, + RegisterOperand regtype1, + RegisterOperand regtype2, string asm, + string kind1, string kind2> + : I<(outs regtype1:$Rd), (ins regtype2:$Rn, regtype2:$Rm), asm, + "\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2, "", []>, Sched<[]> { bits<5> Rd; bits<5> Rn; bits<5> Rm; let Inst{31} = 0; let Inst{30} = Q; let Inst{29} = U; let Inst{28-24} = 0b01110; let Inst{23-22} = size; let Inst{21} = 0b0; let Inst{20-16} = Rm; let Inst{15} = 0b1; let Inst{14-11} = op; let Inst{10} = 0b1; let Inst{9-5} = Rn; let Inst{4-0} = Rd; +} + + +// FCVTN (FP16 to FP8) +multiclass SIMDThreeSameSizeVectorCvt<string asm> { + def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">; + def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">; +} + +// TODO : Create v16f8 value type +// FCVTN, FCVTN2 (FP32 to FP8) +multiclass SIMDThreeVectorCvt<string asm> { + def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">; + def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s", + V128, v16i8, v4f32, null_frag>; +} +//---------------------------------------------------------------------------- // AdvSIMD two register vector instructions.
//---------------------------------------------------------------------------- @@ -6479,6 +6522,16 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>; } +//---------------------------------------------------------------------------- +// FP8 Advanced SIMD two-register miscellaneous +//---------------------------------------------------------------------------- +multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> { + def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128, + asm, ".8h", ".8b", []>; + def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128, + asm#2, ".8h", ".16b", []>; +} + class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2, bits<5> opcode, RegisterOperand regtype, string asm, string kind, string zero, ValueType dty, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 382d3956f105f..6f616b2798435 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -162,6 +162,8 @@ def HasSME2p1 : Predicate<"Subtarget->hasSME2p1()">, AssemblerPredicateWithAll<(all_of FeatureSME2p1), "sme2p1">; def HasFPMR : Predicate<"Subtarget->hasFPMR()">, AssemblerPredicateWithAll<(all_of FeatureFPMR), "fpmr">; +def HasFP8 : Predicate<"Subtarget->hasFP8()">, + AssemblerPredicateWithAll<(all_of FeatureFP8), "fp8">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. @@ -173,6 +175,10 @@ def HasSVE2orSME : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME()">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), "sve2 or sme">; +def HasSVE2orSME2 + : Predicate<"Subtarget->hasSVE2() || Subtarget->hasSME2()">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), + "sve2 or sme2">; def HasSVE2p1_or_HasSME : Predicate<"Subtarget->hasSVE2p1() || Subtarget->hasSME()">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; @@ -9249,6 +9255,15 @@ let Predicates = [HasD128] in { } } +let Predicates = [HasFP8] in { + defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">; + defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">; + defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">; + defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">; + defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">; + defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">; + defm FSCALE : SIMDThreeSameVectorFP<0b1, 0b1, 0b111, "fscale", null_frag>; +} // End let Predicates = [HasFP8] include "AArch64InstrAtomics.td" include "AArch64SVEInstrInfo.td" diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 2685f2e3c8108..cdc79ec3ae344 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -330,14 +330,14 @@ defm UMLSL_VG4_M4ZZ : sme2_int_mla_long_array_vg4_single<"umlsl", 0b11, int_aar defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x2>; defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x4>; -defm FCVT_Z2Z_StoH : sme2_cvt_vg2_single<"fcvt", 0b0000, nxv8f16, nxv4f32, int_aarch64_sve_fcvt_x2>; -defm FCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"fcvtn", 0b0001, nxv8f16, nxv4f32, int_aarch64_sve_fcvtn_x2>; -defm BFCVT_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvt", 0b1000, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvt_x2>; -defm BFCVTN_Z2Z_StoH :
sme2_cvt_vg2_single<"bfcvtn", 0b1001, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvtn_x2>; - -defm SQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvt", 0b0110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvt_x2>; -defm UQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"uqcvt", 0b0111, nxv8i16, nxv4i32, int_aarch64_sve_uqcvt_x2>; -defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b1110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvtu_x2>; +defm FCVT_Z2Z_StoH : sme2_cvt_vg2_single<"fcvt", 0b00000, nxv8f16, nxv4f32, int_aarch64_sve_fcvt_x2>; +defm FCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"fcvtn", 0b00001, nxv8f16, nxv4f32, int_aarch64_sve_fcvtn_x2>; +defm BFCVT_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvt", 0b10000, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvt_x2>; +defm BFCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvtn", 0b10001, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvtn_x2>; + +defm SQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvt", 0b00110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvt_x2>; +defm UQCVT_Z2Z_StoH : sme2_cvt_vg2_single<"uqcvt", 0b00111, nxv8i16, nxv4i32, int_aarch64_sve_uqcvt_x2>; +defm SQCVTU_Z2Z_StoH : sme2_cvt_vg2_single<"sqcvtu", 0b10110, nxv8i16, nxv4i32, int_aarch64_sve_sqcvtu_x2>; defm SQCVT_Z4Z : sme2_int_cvt_vg4_single<"sqcvt", 0b000, int_aarch64_sve_sqcvt_x4>; defm UQCVT_Z4Z : sme2_int_cvt_vg4_single<"uqcvt", 0b001, int_aarch64_sve_uqcvt_x4>; defm SQCVTU_Z4Z : sme2_int_cvt_vg4_single<"sqcvtu", 0b100, int_aarch64_sve_sqcvtu_x4>; @@ -855,3 +855,26 @@ defm BFCLAMP_VG4_4ZZZ: sme2p1_bfclamp_vector_vg4_multi<"bfclamp">; defm BFMOPA_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmopa", 0b1, 0b0, 0b11, ZPR16>; defm BFMOPS_MPPZZ_H : sme2p1_fmop_tile_fp16<"bfmops", 0b1, 0b1, 0b11, ZPR16>; } + +let Predicates = [HasSME2, HasFP8] in { +defm F1CVT_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"f1cvt", 0b00, 0b0>; +defm F1CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"f1cvtl", 0b00, 0b1>; +defm BF1CVT_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf1cvt", 0b01, 0b0>; +defm BF1CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf1cvtl", 0b01, 0b1>; +defm F2CVT_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"f2cvt", 0b10, 0b0>; +defm F2CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"f2cvtl", 0b10, 0b1>; +defm BF2CVT_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf2cvt", 0b11, 0b0>; +defm BF2CVTL_2ZZ_BtoH : sme2p1_fp8_cvt_vector_vg2_single<"bf2cvtl", 0b11, 0b1>; + +defm FCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"fcvt", 0b0>; +defm BFCVT_Z2Z_HtoB : sme2_fp8_cvt_vg2_single<"bfcvt", 0b1>; +defm FCVT_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvt", 0b0>; +defm FCVTN_Z4Z_StoB : sme2_fp8_cvt_vg4_single<"fcvtn", 0b1>; + +defm FSCALE_2ZZ : sme2_fp_sve_destructive_vector_vg2_single<"fscale", 0b0011000>; +defm FSCALE_4ZZ : sme2_fp_sve_destructive_vector_vg4_single<"fscale", 0b0011000>; +defm FSCALE_2Z2Z : sme2_fp_sve_destructive_vector_vg2_multi<"fscale", 0b0011000>; +defm FSCALE_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"fscale", 0b0011000>; + +} // [HasSME2, HasFP8] + diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index d599ac4689e5c..002d5d28fcf8d 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -4002,3 +4002,24 @@ defm UZPQ1_ZZZ : sve2p1_permute_vec_elems_q<0b010, "uzpq1">; defm UZPQ2_ZZZ : sve2p1_permute_vec_elems_q<0b011, "uzpq2">; defm TBLQ_ZZZ : sve2p1_tblq<"tblq">; } // End HasSVE2p1_or_HasSME2p1 + +//===----------------------------------------------------------------------===// +// SVE2 FP8 instructions 
+//===----------------------------------------------------------------------===// +let Predicates = [HasSVE2orSME2, HasFP8] in { +// FP8 upconvert +defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt">; +defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt">; +defm BF1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b10, "bf1cvt">; +defm BF2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b11, "bf2cvt">; +defm F1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b00, "f1cvtlt">; +defm F2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b01, "f2cvtlt">; +defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt">; +defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt">; + +// FP8 downconvert +defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r>; +defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r>; +defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r>; +defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single<0b11, "fcvtnt", ZZ_s_mul_r>; +} // End HasSVE2orSME2, HasFP8 \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 6e70deec3f890..36e34fdc07e7c 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3639,6 +3639,7 @@ static const struct Extension { {"ssbs", {AArch64::FeatureSSBS}}, {"tme", {AArch64::FeatureTME}}, {"fpmr", {AArch64::FeatureFPMR}}, + {"fp8", {AArch64::FeatureFP8}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td index 823115c7d0250..d8b44c68fbdee 100644 --- a/llvm/lib/Target/AArch64/SMEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td @@ -2161,15 +2161,16 @@ multiclass sme2_frint_vector_vg4_multi op> { mnemonic>; } -class sme2_cvt_vg2_single<string mnemonic, bits<4> op> - : I<(outs ZPR16:$Zd), (ins ZZ_s_mul_r:$Zn), +class sme2_cvt_vg2_single<string mnemonic, bits<5> op, + RegisterOperand first_ty, RegisterOperand second_ty> + : I<(outs first_ty:$Zd), (ins second_ty:$Zn), mnemonic, "\t$Zd, $Zn", "", []>, Sched<[]> { bits<4> Zn; bits<5> Zd; let Inst{31-23} = 0b110000010; - let Inst{22} = op{3}; - let Inst{21-18} = 0b1000; - let Inst{17-16} = op{2-1}; + let Inst{22} = op{4}; + let Inst{21-19} = 0b100; + let Inst{18-16} = op{3-1}; let Inst{15-10} = 0b111000; let Inst{9-6} = Zn; let Inst{5} = op{0}; @@ -2178,12 +2179,17 @@ class sme2_cvt_vg2_single<string mnemonic, bits<4> op> // SME2 multi-vec FP down convert two registers // SME2 multi-vec int down convert two registers -multiclass sme2_cvt_vg2_single<string mnemonic, bits<4> op, ValueType out_vt, +multiclass sme2_cvt_vg2_single<string mnemonic, bits<5> op, ValueType out_vt, ValueType in_vt, SDPatternOperator intrinsic> { - def NAME : sme2_cvt_vg2_single<mnemonic, op>; + def NAME : sme2_cvt_vg2_single<mnemonic, op, ZPR16, ZZ_s_mul_r>; def : SVE2p1_Cvt_VG2_Pat<NAME, intrinsic, out_vt, in_vt>; } +// SME2 multi-vec FP8 down convert two registers +multiclass sme2_fp8_cvt_vg2_single<string mnemonic, bit op> { + def NAME : sme2_cvt_vg2_single<mnemonic, {op, 0b1000}, ZPR8, ZZ_h_mul_r>; +} + class sme2_cvt_unpk_vector_vg2<bits<2>sz, bits<3> op, bit u, RegisterOperand first_ty, RegisterOperand second_ty, string mnemonic> : I<(outs first_ty:$Zd), (ins second_ty:$Zn), @@ -2212,7 +2218,13 @@ multiclass sme2p1_fp_cvt_vector_vg2_single<string mnemonic, bit l> { def _S : sme2_cvt_unpk_vector_vg2<0b10, 0b000, l, ZZ_s_mul_r, ZPR16, mnemonic>; } +// SME2 multi-vec FP8 up convert two registers +multiclass sme2p1_fp8_cvt_vector_vg2_single<string mnemonic, bits<2> opc, bit L> { + def _NAME : sme2_cvt_unpk_vector_vg2<opc, 0b110, L, ZZ_h_mul_r, ZPR8, mnemonic>; +} + + +class sme2_cvt_vg4_single<bit sz, bits<3> op, bits<4>op2,
RegisterOperand first_ty, RegisterOperand second_ty, string mnemonic> : I<(outs first_ty:$Zd), (ins second_ty:$Zn), mnemonic, "\t$Zd, $Zn", "", []>, Sched<[]> { @@ -2221,7 +2233,9 @@ class sme2_cvt_vg4_single<bit sz, bits<3> op, RegisterOperand first_ty, let Inst{31-24} = 0b11000001; let Inst{23} = sz; let Inst{22} = op{2}; - let Inst{21-10} = 0b110011111000; + let Inst{21-20} = 0b11; + let Inst{19-16} = op2; + let Inst{15-10} = 0b111000; let Inst{9-7} = Zn; let Inst{6-5} = op{1-0}; let Inst{4-0} = Zd; @@ -2229,13 +2243,18 @@ class sme2_cvt_vg4_single<bit sz, bits<3> op, RegisterOperand first_ty, // SME2 multi-vec int down convert four registers multiclass sme2_int_cvt_vg4_single<string mnemonic, bits<3> op, SDPatternOperator intrinsic> { - def _StoB : sme2_cvt_vg4_single<0, op, ZPR8, ZZZZ_s_mul_r, mnemonic>; - def _DtoH : sme2_cvt_vg4_single<1, op, ZPR16, ZZZZ_d_mul_r, mnemonic>; + def _StoB : sme2_cvt_vg4_single<0, op, 0b0011, ZPR8, ZZZZ_s_mul_r, mnemonic>; + def _DtoH : sme2_cvt_vg4_single<1, op, 0b0011, ZPR16, ZZZZ_d_mul_r, mnemonic>; def : SME2_Cvt_VG4_Pat<NAME # _StoB, intrinsic, nxv16i8, nxv4i32>; def : SME2_Cvt_VG4_Pat<NAME # _DtoH, intrinsic, nxv8i16, nxv2i64>; } +// SME2 multi-vec FP8 down convert four registers +multiclass sme2_fp8_cvt_vg4_single<string mnemonic, bit N> { + def _NAME : sme2_cvt_vg4_single<0b0, {0b00, N}, 0b0100, ZPR8, ZZZZ_s_mul_r, mnemonic>; +} + class sme2_unpk_vector_vg4<bits<2>sz, bit u, RegisterOperand first_ty, RegisterOperand second_ty, string mnemonic> : I<(outs first_ty:$Zd), (ins second_ty:$Zn), diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 7bb457d918821..d2f72fda3a229 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -10078,3 +10078,46 @@ multiclass sve2p1_tblq<string mnemonic> { def _S : sve2p1_permute_vec_elems_q<0b10, 0b110, mnemonic, ZPR32, Z_s>; def _D : sve2p1_permute_vec_elems_q<0b11, 0b110, mnemonic, ZPR64, Z_d>; } + +//===----------------------------------------------------------------------===// +// SVE2 FP8 Instructions +//===----------------------------------------------------------------------===// + +// FP8 upconvert +class sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic, + ZPRRegOp dst_ty, ZPRRegOp src_ty> + : I<(outs dst_ty:$Zd), (ins src_ty:$Zn), + mnemonic, "\t$Zd, $Zn", + "", []>, Sched<[]>{ + bits<5> Zd; + bits<5> Zn; + let Inst{31-17} = 0b011001010000100; + let Inst{16} = L; + let Inst{15-12} = 0b0011; + let Inst{11-10} = opc; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + +multiclass sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic> { + def _BtoH : sve2_fp8_cvt_single<L, opc, mnemonic, ZPR16, ZPR8>; +} + +// FP8 downconvert +class sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, + ZPRRegOp dst_ty, RegisterOperand src_ty> + : I<(outs dst_ty:$Zd), (ins src_ty:$Zn), + mnemonic, "\t$Zd, $Zn", + "", []>, Sched<[]>{ + bits<5> Zd; + bits<4> Zn; + let Inst{31-12} = 0b01100101000010100011; + let Inst{11-10} = opc; + let Inst{9-6} = Zn; + let Inst{5} = 0b0; + let Inst{4-0} = Zd; +} + +multiclass sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, RegisterOperand src> { + def NAME : sve2_fp8_down_cvt_single<opc, mnemonic, ZPR8, src>; +} \ No newline at end of file diff --git a/llvm/test/MC/AArch64/FP8/directive-arch-negative.s b/llvm/test/MC/AArch64/FP8/directive-arch-negative.s new file mode 100644 index 0000000000000..cf48416d29d8a --- /dev/null +++ b/llvm/test/MC/AArch64/FP8/directive-arch-negative.s @@ -0,0 +1,7 @@ +// RUN: not llvm-mc -triple aarch64 -filetype asm -o - %s 2>&1 | FileCheck %s + +.arch armv9-a+fp8 +.arch armv9-a+nofp8 +bf1cvtl v0.8h, v0.8b +// CHECK: error: instruction requires: fp8 +// CHECK: bf1cvtl v0.8h, v0.8b diff --git a/llvm/test/MC/AArch64/FP8/directive-arch.s
b/llvm/test/MC/AArch64/FP8/directive-arch.s new file mode 100644 index 0000000000000..8857d4f0bfbe4 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8/directive-arch.s @@ -0,0 +1,7 @@ +// RUN: llvm-mc -triple aarch64 -o - %s 2>&1 | FileCheck %s + +.arch armv9-a+fp8 +bf1cvtl v0.8h, v0.8b +// CHECK: bf1cvtl v0.8h, v0.8b + +.arch armv9-a+nofp8 diff --git a/llvm/test/MC/AArch64/FP8/miscellaneous-fp8-diagnostics.s b/llvm/test/MC/AArch64/FP8/miscellaneous-fp8-diagnostics.s new file mode 100644 index 0000000000000..4f79f038b6948 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8/miscellaneous-fp8-diagnostics.s @@ -0,0 +1,84 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+fp8 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Element size extension incorrect + +bf1cvtl v0.8h, v0.8h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf1cvtl v0.8h, v0.8h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvtl2 v0.8h, v0.16h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: bf1cvtl2 v0.8h, v0.16h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvtl v0.8h, v0.8h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf2cvtl v0.8h, v0.8h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvtl2 v0.8h, v0.16h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: bf2cvtl2 v0.8h, v0.16h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvtl v0.8h, v0.8h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f1cvtl v0.8h, v0.8h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvtl2 v0.8h, v0.16h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: f1cvtl2 v0.8h, v0.16h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvtl v0.8h, v0.8h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f2cvtl v0.8h, v0.8h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvtl2 v0.8h, v0.16h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: f2cvtl2 v0.8h, v0.16h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn v31.8h, v31.4h, v31.4h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtn v31.8h, v31.4h, v31.4h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn v0.8s, v0.4s, v0.4s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: fcvtn v0.8s, v0.4s, v0.4s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn2 v0.16s, v0.4s, v0.4s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: fcvtn2 v0.16s, v0.4s, v0.4s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale v0.4h, v0.4s, v0.4s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale v0.4h, v0.4s, v0.4s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale v0.8h, v0.8s, v0.8s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid vector kind qualifier +// CHECK-NEXT: fscale v0.8h, v0.8s, v0.8s +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale v0.2s, v0.2h, v0.2h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale v0.2s, v0.2h, v0.2h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale v0.4s, v31.4h, v0.4h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale v0.4s, v31.4h, v0.4h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale v0.2d, v31.2h, v0.2h +// 
CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale v0.2d, v31.2h, v0.2h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/FP8/miscellaneous-fp8.s b/llvm/test/MC/AArch64/FP8/miscellaneous-fp8.s new file mode 100644 index 0000000000000..2f1fd9b86ed84 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8/miscellaneous-fp8.s @@ -0,0 +1,355 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=+fp8 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+fp8 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+fp8 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +/// +/// BF1CVTL instructions. +/// +bf1cvtl v0.8h, v0.8b +// CHECK-INST: bf1cvtl v0.8h, v0.8b +// CHECK-ENCODING: [0x00,0x78,0xa1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ea17800 + +bf1cvtl v0.8h, v31.8b +// CHECK-INST: bf1cvtl v0.8h, v31.8b +// CHECK-ENCODING: [0xe0,0x7b,0xa1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ea17be0 + +bf1cvtl v31.8h, v31.8b +// CHECK-INST: bf1cvtl v31.8h, v31.8b +// CHECK-ENCODING: [0xff,0x7b,0xa1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ea17bff + +/// +/// BF1CVTL2 instructions. +/// +bf1cvtl2 v0.8h, v0.16b +// CHECK-INST: bf1cvtl2 v0.8h, v0.16b +// CHECK-ENCODING: [0x00,0x78,0xa1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ea17800 + +bf1cvtl2 v0.8h, v31.16b +// CHECK-INST: bf1cvtl2 v0.8h, v31.16b +// CHECK-ENCODING: [0xe0,0x7b,0xa1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ea17be0 + +bf1cvtl2 v31.8h, v31.16b +// CHECK-INST: bf1cvtl2 v31.8h, v31.16b +// CHECK-ENCODING: [0xff,0x7b,0xa1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ea17bff + +/// +/// BF2CVTL instructions. +/// +bf2cvtl v0.8h, v0.8b +// CHECK-INST: bf2cvtl v0.8h, v0.8b +// CHECK-ENCODING: [0x00,0x78,0xe1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ee17800 + +bf2cvtl v0.8h, v31.8b +// CHECK-INST: bf2cvtl v0.8h, v31.8b +// CHECK-ENCODING: [0xe0,0x7b,0xe1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ee17be0 + +bf2cvtl v31.8h, v31.8b +// CHECK-INST: bf2cvtl v31.8h, v31.8b +// CHECK-ENCODING: [0xff,0x7b,0xe1,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ee17bff + +/// +/// BF2CVTL2 instructions. 
+/// +bf2cvtl2 v0.8h, v0.16b +// CHECK-INST: bf2cvtl2 v0.8h, v0.16b +// CHECK-ENCODING: [0x00,0x78,0xe1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ee17800 + +bf2cvtl2 v0.8h, v31.16b +// CHECK-INST: bf2cvtl2 v0.8h, v31.16b +// CHECK-ENCODING: [0xe0,0x7b,0xe1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ee17be0 + +bf2cvtl2 v31.8h, v31.16b +// CHECK-INST: bf2cvtl2 v31.8h, v31.16b +// CHECK-ENCODING: [0xff,0x7b,0xe1,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ee17bff + +/// +/// F1CVTL instructions. +/// +f1cvtl v0.8h, v0.8b +// CHECK-INST: f1cvtl v0.8h, v0.8b +// CHECK-ENCODING: [0x00,0x78,0x21,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e217800 + +f1cvtl v0.8h, v31.8b +// CHECK-INST: f1cvtl v0.8h, v31.8b +// CHECK-ENCODING: [0xe0,0x7b,0x21,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e217be0 + +f1cvtl v31.8h, v31.8b +// CHECK-INST: f1cvtl v31.8h, v31.8b +// CHECK-ENCODING: [0xff,0x7b,0x21,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e217bff + +/// +/// F1CVTL2 instructions. +/// +f1cvtl2 v0.8h, v0.16b +// CHECK-INST: f1cvtl2 v0.8h, v0.16b +// CHECK-ENCODING: [0x00,0x78,0x21,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e217800 + +f1cvtl2 v0.8h, v31.16b +// CHECK-INST: f1cvtl2 v0.8h, v31.16b +// CHECK-ENCODING: [0xe0,0x7b,0x21,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e217be0 + +f1cvtl2 v31.8h, v31.16b +// CHECK-INST: f1cvtl2 v31.8h, v31.16b +// CHECK-ENCODING: [0xff,0x7b,0x21,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e217bff + +/// +/// F2CVTL instructions. +/// +f2cvtl v0.8h, v0.8b +// CHECK-INST: f2cvtl v0.8h, v0.8b +// CHECK-ENCODING: [0x00,0x78,0x61,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e617800 + +f2cvtl v0.8h, v31.8b +// CHECK-INST: f2cvtl v0.8h, v31.8b +// CHECK-ENCODING: [0xe0,0x7b,0x61,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e617be0 + +f2cvtl v31.8h, v31.8b +// CHECK-INST: f2cvtl v31.8h, v31.8b +// CHECK-ENCODING: [0xff,0x7b,0x61,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2e617bff + +/// +/// F2CVTL2 instructions. +/// +f2cvtl2 v0.8h, v0.16b +// CHECK-INST: f2cvtl2 v0.8h, v0.16b +// CHECK-ENCODING: [0x00,0x78,0x61,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e617800 + +f2cvtl2 v0.8h, v31.16b +// CHECK-INST: f2cvtl2 v0.8h, v31.16b +// CHECK-ENCODING: [0xe0,0x7b,0x61,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e617be0 + +f2cvtl2 v31.8h, v31.16b +// CHECK-INST: f2cvtl2 v31.8h, v31.16b +// CHECK-ENCODING: [0xff,0x7b,0x61,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6e617bff + +/// +/// FCVTN instructions. 
+/// +// FP16 TO FP8 +fcvtn v31.8b, v31.4h, v31.4h +// CHECK-INST: fcvtn v31.8b, v31.4h, v31.4h +// CHECK-ENCODING: [0xff,0xf7,0x5f,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e5ff7ff + +fcvtn v31.8b, v0.4h, v0.4h +// CHECK-INST: fcvtn v31.8b, v0.4h, v0.4h +// CHECK-ENCODING: [0x1f,0xf4,0x40,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e40f41f + +fcvtn v0.8b, v0.4h, v0.4h +// CHECK-INST: fcvtn v0.8b, v0.4h, v0.4h +// CHECK-ENCODING: [0x00,0xf4,0x40,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e40f400 + +fcvtn v0.16b, v0.8h, v0.8h +// CHECK-INST: fcvtn v0.16b, v0.8h, v0.8h +// CHECK-ENCODING: [0x00,0xf4,0x40,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e40f400 + +fcvtn v31.16b, v0.8h, v0.8h +// CHECK-INST: fcvtn v31.16b, v0.8h, v0.8h +// CHECK-ENCODING: [0x1f,0xf4,0x40,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e40f41f + +fcvtn v31.16b, v31.8h, v31.8h +// CHECK-INST: fcvtn v31.16b, v31.8h, v31.8h +// CHECK-ENCODING: [0xff,0xf7,0x5f,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e5ff7ff + +// FP32 TO FP8 +fcvtn v0.8b, v0.4s, v0.4s +// CHECK-INST: fcvtn v0.8b, v0.4s, v0.4s +// CHECK-ENCODING: [0x00,0xf4,0x00,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e00f400 + +fcvtn v0.8b, v31.4s, v31.4s +// CHECK-INST: fcvtn v0.8b, v31.4s, v31.4s +// CHECK-ENCODING: [0xe0,0xf7,0x1f,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e1ff7e0 + +fcvtn v31.8b, v31.4s, v31.4s +// CHECK-INST: fcvtn v31.8b, v31.4s, v31.4s +// CHECK-ENCODING: [0xff,0xf7,0x1f,0x0e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 0e1ff7ff + +/// +/// FCVTN2 instructions. +/// + +fcvtn2 v0.16b, v0.4s, v0.4s +// CHECK-INST: fcvtn2 v0.16b, v0.4s, v0.4s +// CHECK-ENCODING: [0x00,0xf4,0x00,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e00f400 + +fcvtn2 v0.16b, v0.4s, v31.4s +// CHECK-INST: fcvtn2 v0.16b, v0.4s, v31.4s +// CHECK-ENCODING: [0x00,0xf4,0x1f,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e1ff400 + +fcvtn2 v31.16b, v31.4s, v31.4s +// CHECK-INST: fcvtn2 v31.16b, v31.4s, v31.4s +// CHECK-ENCODING: [0xff,0xf7,0x1f,0x4e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 4e1ff7ff + +/// +/// FSCALE instructions. 
+/// +fscale v0.4h, v0.4h, v0.4h +// CHECK-INST: fscale v0.4h, v0.4h, v0.4h +// CHECK-ENCODING: [0x00,0x3c,0xc0,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ec03c00 + +fscale v0.4h, v31.4h, v31.4h +// CHECK-INST: fscale v0.4h, v31.4h, v31.4h +// CHECK-ENCODING: [0xe0,0x3f,0xdf,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2edf3fe0 + +fscale v31.4h, v31.4h, v31.4h +// CHECK-INST: fscale v31.4h, v31.4h, v31.4h +// CHECK-ENCODING: [0xff,0x3f,0xdf,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2edf3fff + +fscale v0.8h, v0.8h, v0.8h +// CHECK-INST: fscale v0.8h, v0.8h, v0.8h +// CHECK-ENCODING: [0x00,0x3c,0xc0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ec03c00 + +fscale v31.8h, v0.8h, v0.8h +// CHECK-INST: fscale v31.8h, v0.8h, v0.8h +// CHECK-ENCODING: [0x1f,0x3c,0xc0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ec03c1f + +fscale v31.8h, v31.8h, v31.8h +// CHECK-INST: fscale v31.8h, v31.8h, v31.8h +// CHECK-ENCODING: [0xff,0x3f,0xdf,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6edf3fff + +fscale v0.2s, v0.2s, v0.2s +// CHECK-INST: fscale v0.2s, v0.2s, v0.2s +// CHECK-ENCODING: [0x00,0xfc,0xa0,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ea0fc00 + +fscale v0.2s, v0.2s, v31.2s +// CHECK-INST: fscale v0.2s, v0.2s, v31.2s +// CHECK-ENCODING: [0x00,0xfc,0xbf,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ebffc00 + +fscale v31.2s, v31.2s, v31.2s +// CHECK-INST: fscale v31.2s, v31.2s, v31.2s +// CHECK-ENCODING: [0xff,0xff,0xbf,0x2e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 2ebfffff + +fscale v0.4s, v0.4s, v0.4s +// CHECK-INST: fscale v0.4s, v0.4s, v0.4s +// CHECK-ENCODING: [0x00,0xfc,0xa0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ea0fc00 + +fscale v0.4s, v31.4s, v0.4s +// CHECK-INST: fscale v0.4s, v31.4s, v0.4s +// CHECK-ENCODING: [0xe0,0xff,0xa0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ea0ffe0 + +fscale v31.4s, v31.4s, v31.4s +// CHECK-INST: fscale v31.4s, v31.4s, v31.4s +// CHECK-ENCODING: [0xff,0xff,0xbf,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ebfffff + +fscale v0.2d, v0.2d, v0.2d +// CHECK-INST: fscale v0.2d, v0.2d, v0.2d +// CHECK-ENCODING: [0x00,0xfc,0xe0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ee0fc00 + +fscale v0.2d, v31.2d, v0.2d +// CHECK-INST: fscale v0.2d, v31.2d, v0.2d +// CHECK-ENCODING: [0xe0,0xff,0xe0,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6ee0ffe0 + +fscale v31.2d, v31.2d, v31.2d +// CHECK-INST: fscale v31.2d, v31.2d, v31.2d +// CHECK-ENCODING: [0xff,0xff,0xff,0x6e] +// CHECK-ERROR: instruction requires: fp8 +// CHECK-UNKNOWN: 6effffff diff --git a/llvm/test/MC/AArch64/FP8_SME2/cvt-diagnostics.s b/llvm/test/MC/AArch64/FP8_SME2/cvt-diagnostics.s new file mode 100644 index 0000000000000..418ae9e2a4405 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SME2/cvt-diagnostics.s @@ -0,0 +1,87 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Incorrect operand + +f1cvt { z0.h, z1.h }, z0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unknown token in expression +// CHECK-NEXT: f1cvt { z0.h, z1.h }, z0 +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvt { z0, z1 }, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for 
instruction +// CHECK-NEXT: bf1cvt { z0, z1 }, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvtl { z0.b, z1.b }, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf1cvtl { z0.b, z1.b }, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvt { z0.h, z1.h }, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf2cvt { z0.h, z1.h }, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvtl { z30.h}, z31.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf2cvtl { z30.h}, z31.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvt { z0, z1.h }, {z0.b} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: mismatched register size suffix +// CHECK-NEXT: f2cvt { z0, z1.h }, {z0.b} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvtl z0.h, z1.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f2cvtl z0.h, z1.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvt z31.b, { z30.h } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvt z31.b, { z30.h } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bfcvt z0.b, { z0.b, z1.b } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bfcvt z0.b, { z0.b, z1.b } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Incorrect range of vectors + +bf1cvt { z1.h, z2.h }, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: bf1cvt { z1.h, z2.h }, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvt { z1.h, z0.h }, z31.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f1cvt { z1.h, z0.h }, z31.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvtl { z31.h, z0.h }, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: f1cvtl { z31.h, z0.h }, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvt z31.b, { z29.s - z0.s } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types +// CHECK-NEXT: fcvt z31.b, { z29.s - z0.s } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn z31.b, { z30.s - z1.s } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types +// CHECK-NEXT: fcvtn z31.b, { z30.s - z1.s } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn z0.b, { z31.s - z2.s } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types +// CHECK-NEXT: fcvtn z0.b, { z31.s - z2.s } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn z0.b, { z1.s - z4.s } +// CHECK: [[@LINE-1]]:{{[0-9]+}}: +// CHECK-NEXT: fcvtn z0.b, { z1.s - z4.s } +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/FP8_SME2/cvt.s b/llvm/test/MC/AArch64/FP8_SME2/cvt.s new file mode 100644 index 0000000000000..35539823fde22 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SME2/cvt.s @@ -0,0 +1,157 @@ +// RUN: llvm-mc 
-triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+fp8 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+fp8 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +f1cvt {z0.h-z1.h}, z0.b // 11000001-00100110-11100000-00000000 +// CHECK-INST: f1cvt { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x00,0xe0,0x26,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c126e000 + +f1cvt {z30.h-z31.h}, z31.b // 11000001-00100110-11100011-11111110 +// CHECK-INST: f1cvt { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xfe,0xe3,0x26,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c126e3fe + +f1cvtl {z0.h-z1.h}, z0.b // 11000001-00100110-11100000-00000001 +// CHECK-INST: f1cvtl { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x01,0xe0,0x26,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c126e001 + +f1cvtl {z30.h-z31.h}, z31.b // 11000001-00100110-11100011-11111111 +// CHECK-INST: f1cvtl { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xff,0xe3,0x26,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c126e3ff + +bf1cvt {z0.h-z1.h}, z0.b // 11000001-01100110-11100000-00000000 +// CHECK-INST: bf1cvt { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x00,0xe0,0x66,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c166e000 + +bf1cvt {z30.h-z31.h}, z31.b // 11000001-01100110-11100011-11111110 +// CHECK-INST: bf1cvt { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xfe,0xe3,0x66,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c166e3fe + +bf1cvtl {z0.h-z1.h}, z0.b // 11000001-01100110-11100000-00000001 +// CHECK-INST: bf1cvtl { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x01,0xe0,0x66,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c166e001 + +bf1cvtl {z30.h-z31.h}, z31.b // 11000001-01100110-11100011-11111111 +// CHECK-INST: bf1cvtl { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xff,0xe3,0x66,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c166e3ff + +bf2cvt {z0.h-z1.h}, z0.b // 11000001-11100110-11100000-00000000 +// CHECK-INST: bf2cvt { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x00,0xe0,0xe6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e6e000 + +bf2cvt {z30.h-z31.h}, z31.b // 11000001-11100110-11100011-11111110 +// CHECK-INST: bf2cvt { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xfe,0xe3,0xe6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e6e3fe + +bf2cvtl {z0.h-z1.h}, z0.b // 11000001-11100110-11100000-00000001 +// CHECK-INST: bf2cvtl { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x01,0xe0,0xe6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e6e001 + +bf2cvtl {z30.h-z31.h}, z31.b // 11000001-11100110-11100011-11111111 +// CHECK-INST: 
bf2cvtl { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xff,0xe3,0xe6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e6e3ff + +f2cvt {z0.h-z1.h}, z0.b // 11000001-10100110-11100000-00000000 +// CHECK-INST: f2cvt { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x00,0xe0,0xa6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a6e000 + +f2cvt {z30.h-z31.h}, z31.b // 11000001-10100110-11100011-11111110 +// CHECK-INST: f2cvt { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xfe,0xe3,0xa6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a6e3fe + +f2cvtl {z0.h-z1.h}, z0.b // 11000001-10100110-11100000-00000001 +// CHECK-INST: f2cvtl { z0.h, z1.h }, z0.b +// CHECK-ENCODING: [0x01,0xe0,0xa6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a6e001 + +f2cvtl {z30.h-z31.h}, z31.b // 11000001-10100110-11100011-11111111 +// CHECK-INST: f2cvtl { z30.h, z31.h }, z31.b +// CHECK-ENCODING: [0xff,0xe3,0xa6,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a6e3ff + +fcvt z0.b, {z0.h-z1.h} // 11000001-00100100-11100000-00000000 +// CHECK-INST: fcvt z0.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x00,0xe0,0x24,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c124e000 + +fcvt z31.b, {z30.h-z31.h} // 11000001-00100100-11100011-11011111 +// CHECK-INST: fcvt z31.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0xe3,0x24,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c124e3df + +fcvt z0.b, {z0.s-z3.s} // 11000001-00110100-11100000-00000000 +// CHECK-INST: fcvt z0.b, { z0.s - z3.s } +// CHECK-ENCODING: [0x00,0xe0,0x34,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c134e000 + +fcvt z31.b, {z28.s-z31.s} // 11000001-00110100-11100011-10011111 +// CHECK-INST: fcvt z31.b, { z28.s - z31.s } +// CHECK-ENCODING: [0x9f,0xe3,0x34,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c134e39f + +fcvtn z0.b, {z0.s-z3.s} // 11000001-00110100-11100000-00100000 +// CHECK-INST: fcvtn z0.b, { z0.s - z3.s } +// CHECK-ENCODING: [0x20,0xe0,0x34,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c134e020 + +fcvtn z31.b, {z28.s-z31.s} // 11000001-00110100-11100011-10111111 +// CHECK-INST: fcvtn z31.b, { z28.s - z31.s } +// CHECK-ENCODING: [0xbf,0xe3,0x34,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c134e3bf + +bfcvt z0.b, {z0.h-z1.h} // 11000001-01100100-11100000-00000000 +// CHECK-INST: bfcvt z0.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x00,0xe0,0x64,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c164e000 + +bfcvt z31.b, {z30.h-z31.h} // 11000001-01100100-11100011-11011111 +// CHECK-INST: bfcvt z31.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0xe3,0x64,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c164e3df diff --git a/llvm/test/MC/AArch64/FP8_SME2/fscale-diagnostics.c b/llvm/test/MC/AArch64/FP8_SME2/fscale-diagnostics.c new file mode 100644 index 0000000000000..b9288835a7001 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SME2/fscale-diagnostics.c @@ -0,0 +1,62 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 2>&1 < %s| FileCheck %s + +// --------------------------------------------------------------------------// +// Incorrect operand + +fscale {z0.h-z1.h}, {z0.h-z1.h}, z0 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand +// CHECK-NEXT: fscale {z0.h-z1.h}, {z0.h-z1.h}, z0 +// CHECK-NOT: 
[[@LINE-1]]:{{[0-9]+}}: + +fscale {z0.d-z1.d}, {z0.h-z1.h}, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z0.d-z1.d}, {z0.h-z1.h}, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z30.s-z31.s}, {z30.s-z31.s}, {z30 - z31} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z30.s-z31.s}, {z30.s-z31.s}, {z30 - z31} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z0.s-z3.s}, {z0.d-z3.d}, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z0.s-z3.s}, {z0.d-z3.d}, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z28.h-z31.h}, {z28-z31}, {z28.h-z31.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z28.h-z31.h}, {z28-z31}, {z28.h-z31.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z28.d-z31.d}, z28.d, {z28.d-z31.d} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z28.d-z31.d}, z28.d, {z28.d-z31.d} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z0.h-z1.h}, {z1.h-z4.h}, {z0.h-z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fscale {z0.h-z1.h}, {z1.h-z4.h}, {z0.h-z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +// --------------------------------------------------------------------------// +// Incorrect range of vectors + +fscale {z0.h-z1.h}, {z1.h-z2.h}, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: fscale {z0.h-z1.h}, {z1.h-z2.h}, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z0.h-z1.h}, {z31.h-z0.h}, {z0.h-z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: fscale {z0.h-z1.h}, {z31.h-z0.h}, {z0.h-z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z2.h-z5.h}, {z0.h-z3.h}, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types +// CHECK-NEXT: fscale {z2.h-z5.h}, {z0.h-z3.h}, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fscale {z0.h-z3.h}, {z0.h-z3.h}, {z3.h-z6.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 4 consecutive SVE vectors, where the first vector is a multiple of 4 and with matching element types +// CHECK-NEXT: fscale {z0.h-z3.h}, {z0.h-z3.h}, {z3.h-z6.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: diff --git a/llvm/test/MC/AArch64/FP8_SME2/fscale.s b/llvm/test/MC/AArch64/FP8_SME2/fscale.s new file mode 100644 index 0000000000000..b07bc9606ade4 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SME2/fscale.s @@ -0,0 +1,160 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=+sme2,+fp8 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sme2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// 
Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sme2,+fp8 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +//2X +fscale {z0.h-z1.h}, {z0.h-z1.h}, z0.h // 11000001-01100000-10100001-10000000 +// CHECK-INST: fscale { z0.h, z1.h }, { z0.h, z1.h }, z0.h +// CHECK-ENCODING: [0x80,0xa1,0x60,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c160a180 + +fscale {z30.h-z31.h}, {z30.h-z31.h}, z15.h // 11000001-01101111-10100001-10011110 +// CHECK-INST: fscale { z30.h, z31.h }, { z30.h, z31.h }, z15.h +// CHECK-ENCODING: [0x9e,0xa1,0x6f,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c16fa19e + +fscale {z0.s-z1.s}, {z0.s-z1.s}, z0.s // 11000001-10100000-10100001-10000000 +// CHECK-INST: fscale { z0.s, z1.s }, { z0.s, z1.s }, z0.s +// CHECK-ENCODING: [0x80,0xa1,0xa0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a0a180 + +fscale {z30.s-z31.s}, {z30.s-z31.s}, z15.s // 11000001-10101111-10100001-10011110 +// CHECK-INST: fscale { z30.s, z31.s }, { z30.s, z31.s }, z15.s +// CHECK-ENCODING: [0x9e,0xa1,0xaf,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1afa19e + +fscale {z0.d-z1.d}, {z0.d-z1.d}, z0.d // 11000001-11100000-10100001-10000000 +// CHECK-INST: fscale { z0.d, z1.d }, { z0.d, z1.d }, z0.d +// CHECK-ENCODING: [0x80,0xa1,0xe0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e0a180 + +fscale {z30.d-z31.d}, {z30.d-z31.d}, z15.d // 11000001-11101111-10100001-10011110 +// CHECK-INST: fscale { z30.d, z31.d }, { z30.d, z31.d }, z15.d +// CHECK-ENCODING: [0x9e,0xa1,0xef,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1efa19e + +fscale {z0.h-z1.h}, {z0.h-z1.h}, {z0.h-z1.h} // 11000001-01100000-10110001-10000000 +// CHECK-INST: fscale { z0.h, z1.h }, { z0.h, z1.h }, { z0.h, z1.h } +// CHECK-ENCODING: [0x80,0xb1,0x60,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c160b180 + +fscale {z30.h-z31.h}, {z30.h-z31.h}, {z30.h-z31.h} // 11000001-01111110-10110001-10011110 +// CHECK-INST: fscale { z30.h, z31.h }, { z30.h, z31.h }, { z30.h, z31.h } +// CHECK-ENCODING: [0x9e,0xb1,0x7e,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c17eb19e + +fscale {z0.s-z1.s}, {z0.s-z1.s}, {z0.s-z1.s} // 11000001-10100000-10110001-10000000 +// CHECK-INST: fscale { z0.s, z1.s }, { z0.s, z1.s }, { z0.s, z1.s } +// CHECK-ENCODING: [0x80,0xb1,0xa0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a0b180 + +fscale {z30.s-z31.s}, {z30.s-z31.s}, {z30.s-z31.s} // 11000001-10111110-10110001-10011110 +// CHECK-INST: fscale { z30.s, z31.s }, { z30.s, z31.s }, { z30.s, z31.s } +// CHECK-ENCODING: [0x9e,0xb1,0xbe,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1beb19e + +fscale {z0.d-z1.d}, {z0.d-z1.d}, {z0.d-z1.d} // 11000001-11100000-10110001-10000000 +// CHECK-INST: fscale { z0.d, z1.d }, { z0.d, z1.d }, { z0.d, z1.d } +// CHECK-ENCODING: [0x80,0xb1,0xe0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e0b180 + +fscale {z30.d-z31.d}, {z30.d-z31.d}, {z30.d-z31.d} // 11000001-11111110-10110001-10011110 +// CHECK-INST: fscale { z30.d, z31.d }, { z30.d, z31.d }, { z30.d, z31.d } +// CHECK-ENCODING: [0x9e,0xb1,0xfe,0xc1] +// 
CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1feb19e + + +//4X + +fscale {z0.h-z3.h}, {z0.h-z3.h}, z0.h // 11000001-01100000-10101001-10000000 +// CHECK-INST: fscale { z0.h - z3.h }, { z0.h - z3.h }, z0.h +// CHECK-ENCODING: [0x80,0xa9,0x60,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c160a980 + +fscale {z28.h-z31.h}, {z28.h-z31.h}, z15.h // 11000001-01101111-10101001-10011100 +// CHECK-INST: fscale { z28.h - z31.h }, { z28.h - z31.h }, z15.h +// CHECK-ENCODING: [0x9c,0xa9,0x6f,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c16fa99c + +fscale {z0.s-z3.s}, {z0.s-z3.s}, z0.s // 11000001-10100000-10101001-10000000 +// CHECK-INST: fscale { z0.s - z3.s }, { z0.s - z3.s }, z0.s +// CHECK-ENCODING: [0x80,0xa9,0xa0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a0a980 + +fscale {z28.s-z31.s}, {z28.s-z31.s}, z15.s // 11000001-10101111-10101001-10011100 +// CHECK-INST: fscale { z28.s - z31.s }, { z28.s - z31.s }, z15.s +// CHECK-ENCODING: [0x9c,0xa9,0xaf,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1afa99c + +fscale {z0.d-z3.d}, {z0.d-z3.d}, z0.d // 11000001-11100000-10101001-10000000 +// CHECK-INST: fscale { z0.d - z3.d }, { z0.d - z3.d }, z0.d +// CHECK-ENCODING: [0x80,0xa9,0xe0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e0a980 + +fscale {z28.d-z31.d}, {z28.d-z31.d}, z15.d // 11000001-11101111-10101001-10011100 +// CHECK-INST: fscale { z28.d - z31.d }, { z28.d - z31.d }, z15.d +// CHECK-ENCODING: [0x9c,0xa9,0xef,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1efa99c + +fscale {z0.h-z3.h}, {z0.h-z3.h}, {z0.h-z3.h} // 11000001-01100000-10111001-10000000 +// CHECK-INST: fscale { z0.h - z3.h }, { z0.h - z3.h }, { z0.h - z3.h } +// CHECK-ENCODING: [0x80,0xb9,0x60,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c160b980 + +fscale {z28.h-z31.h}, {z28.h-z31.h}, {z28.h-z31.h} // 11000001-01111100-10111001-10011100 +// CHECK-INST: fscale { z28.h - z31.h }, { z28.h - z31.h }, { z28.h - z31.h } +// CHECK-ENCODING: [0x9c,0xb9,0x7c,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c17cb99c + +fscale {z0.s-z3.s}, {z0.s-z3.s}, {z0.s-z3.s} // 11000001-10100000-10111001-10000000 +// CHECK-INST: fscale { z0.s - z3.s }, { z0.s - z3.s }, { z0.s - z3.s } +// CHECK-ENCODING: [0x80,0xb9,0xa0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1a0b980 + +fscale {z28.s-z31.s}, {z28.s-z31.s}, {z28.s-z31.s} // 11000001-10111100-10111001-10011100 +// CHECK-INST: fscale { z28.s - z31.s }, { z28.s - z31.s }, { z28.s - z31.s } +// CHECK-ENCODING: [0x9c,0xb9,0xbc,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1bcb99c + +fscale {z0.d-z3.d}, {z0.d-z3.d}, {z0.d-z3.d} // 11000001-11100000-10111001-10000000 +// CHECK-INST: fscale { z0.d - z3.d }, { z0.d - z3.d }, { z0.d - z3.d } +// CHECK-ENCODING: [0x80,0xb9,0xe0,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1e0b980 + +fscale {z28.d-z31.d}, {z28.d-z31.d}, {z28.d-z31.d} // 11000001-11111100-10111001-10011100 +// CHECK-INST: fscale { z28.d - z31.d }, { z28.d - z31.d }, { z28.d - z31.d } +// CHECK-ENCODING: [0x9c,0xb9,0xfc,0xc1] +// CHECK-ERROR: instruction requires: fp8 sme2 +// CHECK-UNKNOWN: c1fcb99c diff --git a/llvm/test/MC/AArch64/FP8_SVE2/fcvt-diagnostics.s b/llvm/test/MC/AArch64/FP8_SVE2/fcvt-diagnostics.s new file mode 100644 index 0000000000000..6ecbdaed314b3 --- 
/dev/null +++ b/llvm/test/MC/AArch64/FP8_SVE2/fcvt-diagnostics.s @@ -0,0 +1,131 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// + +f1cvt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f1cvt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f1cvt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f1cvt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +f2cvt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f2cvt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f2cvt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f2cvt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +bf1cvt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf1cvt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf1cvt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf1cvt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +bf2cvt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf2cvt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf2cvt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf2cvt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +f1cvtlt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f1cvtlt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvtlt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f1cvtlt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f1cvtlt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f1cvtlt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +f2cvtlt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f2cvtlt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvtlt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: f2cvtlt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +f2cvtlt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: f2cvtlt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +bf1cvtlt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf1cvtlt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvtlt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: 
bf1cvtlt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf1cvtlt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf1cvtlt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + +bf2cvtlt z0.h, z0.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf2cvtlt z0.h, z0.h +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvtlt z0.b, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: bf2cvtlt z0.b, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bf2cvtlt z32.h, z0.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bf2cvtlt z32.h, z0.b +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/FP8_SVE2/fcvt.s b/llvm/test/MC/AArch64/FP8_SVE2/fcvt.s new file mode 100644 index 0000000000000..2301935db012f --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SVE2/fcvt.s @@ -0,0 +1,237 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+fp8 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2,+fp8 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// +// F1CVT instructions +// +f1cvt z0.h, z0.b // 01100101-00001000-00110000-00000000 +// CHECK-INST: f1cvt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x30,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083000 + +f1cvt z0.h, z31.b // 01100101-00001000-00110011-11100000 +// CHECK-INST: f1cvt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x33,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650833e0 + +f1cvt z31.h, z0.b // 01100101-00001000-00110000-00011111 +// CHECK-INST: f1cvt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x30,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 6508301f + +f1cvt z31.h, z31.b // 01100101-00001000-00110011-11111111 +// CHECK-INST: f1cvt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x33,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650833ff + +// +// F2CVT instructions +// +f2cvt z0.h, z0.b // 01100101-00001000-00110100-00000000 +// CHECK-INST: f2cvt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x34,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083400 + +f2cvt z0.h, z31.b // 01100101-00001000-00110111-11100000 +// CHECK-INST: f2cvt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x37,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650837e0 + +f2cvt z31.h, z0.b // 01100101-00001000-00110100-00011111 +// CHECK-INST: f2cvt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x34,0x08,0x65] +// CHECK-ERROR: instruction 
requires: fp8 sve2 +// CHECK-UNKNOWN: 6508341f + +f2cvt z31.h, z31.b // 01100101-00001000-00110111-11111111 +// CHECK-INST: f2cvt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x37,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650837ff + + +// +// BF1CVT instructions +// +bf1cvt z0.h, z0.b // 01100101-00001000-00111000-00000000 +// CHECK-INST: bf1cvt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x38,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083800 + +bf1cvt z0.h, z31.b // 01100101-00001000-00111011-11100000 +// CHECK-INST: bf1cvt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x3b,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083be0 + +bf1cvt z31.h, z0.b // 01100101-00001000-00111000-00011111 +// CHECK-INST: bf1cvt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x38,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 6508381f + +bf1cvt z31.h, z31.b // 01100101-00001000-00111011-11111111 +// CHECK-INST: bf1cvt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x3b,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083bff + + +// +// BF2CVT instructions +// +bf2cvt z0.h, z0.b // 01100101-00001000-00111100-00000000 +// CHECK-INST: bf2cvt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x3c,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083c00 + +bf2cvt z0.h, z31.b // 01100101-00001000-00111111-11100000 +// CHECK-INST: bf2cvt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x3f,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083fe0 + +bf2cvt z31.h, z0.b // 01100101-00001000-00111100-00011111 +// CHECK-INST: bf2cvt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x3c,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083c1f + +bf2cvt z31.h, z31.b // 01100101-00001000-00111111-11111111 +// CHECK-INST: bf2cvt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x3f,0x08,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65083fff + + +// +// F1CVTLT instructions +// +f1cvtlt z0.h, z0.b // 01100101-00001001-00110000-00000000 +// CHECK-INST: f1cvtlt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x30,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093000 + +f1cvtlt z0.h, z31.b // 01100101-00001001-00110011-11100000 +// CHECK-INST: f1cvtlt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x33,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650933e0 + +f1cvtlt z31.h, z0.b // 01100101-00001001-00110000-00011111 +// CHECK-INST: f1cvtlt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x30,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 6509301f + +f1cvtlt z31.h, z31.b // 01100101-00001001-00110011-11111111 +// CHECK-INST: f1cvtlt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x33,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650933ff + + +// +// F2CVTLT instructions +// +f2cvtlt z0.h, z0.b // 01100101-00001001-00110100-00000000 +// CHECK-INST: f2cvtlt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x34,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093400 + +f2cvtlt z0.h, z31.b // 01100101-00001001-00110111-11100000 +// CHECK-INST: f2cvtlt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x37,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650937e0 + +f2cvtlt z31.h, z0.b // 01100101-00001001-00110100-00011111 +// CHECK-INST: f2cvtlt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x34,0x09,0x65] +// 
CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 6509341f + +f2cvtlt z31.h, z31.b // 01100101-00001001-00110111-11111111 +// CHECK-INST: f2cvtlt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x37,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650937ff + + +// +// BF1CVTLT instructions +// +bf1cvtlt z0.h, z0.b // 01100101-00001001-00111000-00000000 +// CHECK-INST: bf1cvtlt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x38,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093800 + +bf1cvtlt z0.h, z31.b // 01100101-00001001-00111011-11100000 +// CHECK-INST: bf1cvtlt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x3b,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093be0 + +bf1cvtlt z31.h, z0.b // 01100101-00001001-00111000-00011111 +// CHECK-INST: bf1cvtlt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x38,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 6509381f + +bf1cvtlt z31.h, z31.b // 01100101-00001001-00111011-11111111 +// CHECK-INST: bf1cvtlt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x3b,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093bff + + +// +// BF2CVTLT instructions +// +bf2cvtlt z0.h, z0.b // 01100101-00001001-00111100-00000000 +// CHECK-INST: bf2cvtlt z0.h, z0.b +// CHECK-ENCODING: [0x00,0x3c,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093c00 + +bf2cvtlt z0.h, z31.b // 01100101-00001001-00111111-11100000 +// CHECK-INST: bf2cvtlt z0.h, z31.b +// CHECK-ENCODING: [0xe0,0x3f,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093fe0 + +bf2cvtlt z31.h, z0.b // 01100101-00001001-00111100-00011111 +// CHECK-INST: bf2cvtlt z31.h, z0.b +// CHECK-ENCODING: [0x1f,0x3c,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093c1f + +bf2cvtlt z31.h, z31.b // 01100101-00001001-00111111-11111111 +// CHECK-INST: bf2cvtlt z31.h, z31.b +// CHECK-ENCODING: [0xff,0x3f,0x09,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 65093fff diff --git a/llvm/test/MC/AArch64/FP8_SVE2/fcvtn-diagnostics.s b/llvm/test/MC/AArch64/FP8_SVE2/fcvtn-diagnostics.s new file mode 100644 index 0000000000000..86cca86542aa3 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SVE2/fcvtn-diagnostics.s @@ -0,0 +1,70 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 2>&1 < %s | FileCheck %s +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// + +fcvtn z0.b, {z1.h, z2.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: fcvtn z0.b, {z1.h, z2.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn z0.h, {z0.h, z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtn z0.h, {z0.h, z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtn z0.b, {z0.b, z1.b} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtn z0.b, {z0.b, z1.b} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + + +fcvtnb z0.b, {z1.s, z2.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element type +// CHECK-NEXT: fcvtnb z0.b, {z1.s, z2.s} +// 
CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtnb z0.h, {z0.s, z1.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +// CHECK-NEXT: fcvtnb z0.h, {z0.s, z1.s} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtnb z0.b, {z0.h, z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtnb z0.b, {z0.h, z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + + +bfcvtn z0.b, {z1.h, z2.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element types +// CHECK-NEXT: bfcvtn z0.b, {z1.h, z2.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bfcvtn z0.h, {z0.h, z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bfcvtn z0.h, {z0.h, z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +bfcvtn z0.b, {z0.b, z1.b} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: bfcvtn z0.b, {z0.b, z1.b} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + + + +fcvtnt z0.b, {z1.s, z2.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid vector list, expected list with 2 consecutive SVE vectors, where the first vector is a multiple of 2 and with matching element type +// CHECK-NEXT: fcvtnt z0.b, {z1.s, z2.s} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtnt z0.h, {z0.s, z1.s} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtnt z0.h, {z0.s, z1.s} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: + +fcvtnt z0.b, {z0.h, z1.h} +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: fcvtnt z0.b, {z0.h, z1.h} +// CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}: \ No newline at end of file diff --git a/llvm/test/MC/AArch64/FP8_SVE2/fcvtn.s b/llvm/test/MC/AArch64/FP8_SVE2/fcvtn.s new file mode 100644 index 0000000000000..e16ff24098ef1 --- /dev/null +++ b/llvm/test/MC/AArch64/FP8_SVE2/fcvtn.s @@ -0,0 +1,125 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sme2,+fp8 < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=+sve2,+fp8 - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve2,+fp8 < %s \ +// RUN: | llvm-objdump -d --mattr=-sme2 - | FileCheck %s --check-prefix=CHECK-UNKNOWN +// Disassemble encoding and check the re-encoding (-show-encoding) matches. 
+// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve2,+fp8 < %s \ +// RUN: | sed '/.text/d' | sed 's/.*encoding: //g' \ +// RUN: | llvm-mc -triple=aarch64 -mattr=+sve2,+fp8 -disassemble -show-encoding \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST + +// +// FCVTN instructions +// +fcvtn z0.b, {z0.h, z1.h} // 01100101-00001010-00110000-00000000 +// CHECK-INST: fcvtn z0.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x00,0x30,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3000 + +fcvtn z0.b, {z30.h, z31.h} // 01100101-00001010-00110011-11000000 +// CHECK-INST: fcvtn z0.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xc0,0x33,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a33c0 + +fcvtn z31.b, {z0.h, z1.h} // 01100101-00001010-00110000-00011111 +// CHECK-INST: fcvtn z31.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x1f,0x30,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a301f + +fcvtn z31.b, {z30.h, z31.h} // 01100101-00001010-00110011-11011111 +// CHECK-INST: fcvtn z31.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x33,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a33df + +// +// FCVTNB instructions +// +fcvtnb z0.b, {z0.s, z1.s} // 01100101-00001010-00110100-00000000 +// CHECK-INST: fcvtnb z0.b, { z0.s, z1.s } +// CHECK-ENCODING: [0x00,0x34,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3400 + +fcvtnb z0.b, {z30.s, z31.s} // 01100101-00001010-00110111-11000000 +// CHECK-INST: fcvtnb z0.b, { z30.s, z31.s } +// CHECK-ENCODING: [0xc0,0x37,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a37c0 + +fcvtnb z31.b, {z0.s, z1.s} // 01100101-00001010-00110100-00011111 +// CHECK-INST: fcvtnb z31.b, { z0.s, z1.s } +// CHECK-ENCODING: [0x1f,0x34,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a341f + +fcvtnb z31.b, {z30.s, z31.s} // 01100101-00001010-00110111-11011111 +// CHECK-INST: fcvtnb z31.b, { z30.s, z31.s } +// CHECK-ENCODING: [0xdf,0x37,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a37df + + +// +// BFCVTN instructions +// +bfcvtn z0.b, {z0.h, z1.h} // 01100101-00001010-00111000-00000000 +// CHECK-INST: bfcvtn z0.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x00,0x38,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3800 + +bfcvtn z0.b, {z30.h, z31.h} // 01100101-00001010-00111011-11000000 +// CHECK-INST: bfcvtn z0.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xc0,0x3b,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3bc0 + +bfcvtn z31.b, {z0.h, z1.h} // 01100101-00001010-00111000-00011111 +// CHECK-INST: bfcvtn z31.b, { z0.h, z1.h } +// CHECK-ENCODING: [0x1f,0x38,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a381f + +bfcvtn z31.b, {z30.h, z31.h} // 01100101-00001010-00111011-11011111 +// CHECK-INST: bfcvtn z31.b, { z30.h, z31.h } +// CHECK-ENCODING: [0xdf,0x3b,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3bdf + + +// +// FCVTNT instructions +// +fcvtnt z0.b, {z0.s, z1.s} // 01100101-00001010-00111100-00000000 +// CHECK-INST: fcvtnt z0.b, { z0.s, z1.s } +// CHECK-ENCODING: [0x00,0x3c,0x0a,0x65] +// CHECK-ERROR: instruction requires: fp8 sve2 +// CHECK-UNKNOWN: 650a3c00 + +fcvtnt z0.b, {z30.s, z31.s} // 01100101-00001010-00111111-11000000 +// CHECK-INST: fcvtnt z0.b, { z30.s, z31.s } +// 
CHECK-ENCODING: [0xc0,0x3f,0x0a,0x65]
+// CHECK-ERROR: instruction requires: fp8 sve2
+// CHECK-UNKNOWN: 650a3fc0
+
+fcvtnt  z31.b, {z0.s, z1.s}  // 01100101-00001010-00111100-00011111
+// CHECK-INST: fcvtnt  z31.b, { z0.s, z1.s }
+// CHECK-ENCODING: [0x1f,0x3c,0x0a,0x65]
+// CHECK-ERROR: instruction requires: fp8 sve2
+// CHECK-UNKNOWN: 650a3c1f
+
+fcvtnt  z31.b, {z30.s, z31.s}  // 01100101-00001010-00111111-11011111
+// CHECK-INST: fcvtnt  z31.b, { z30.s, z31.s }
+// CHECK-ENCODING: [0xdf,0x3f,0x0a,0x65]
+// CHECK-ERROR: instruction requires: fp8 sve2
+// CHECK-UNKNOWN: 650a3fdf
diff --git a/llvm/test/MC/AArch64/SVE2/fcvtnt-diagnostics.s b/llvm/test/MC/AArch64/SVE2/fcvtnt-diagnostics.s
index abfdea8c4853b..ae287b94ec01b 100644
--- a/llvm/test/MC/AArch64/SVE2/fcvtnt-diagnostics.s
+++ b/llvm/test/MC/AArch64/SVE2/fcvtnt-diagnostics.s
@@ -5,7 +5,7 @@
 // Invalid element width

 fcvtnt z0.b, p0/m, z0.b
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: fcvtnt z0.b, p0/m, z0.b
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:

@@ -25,7 +25,7 @@ fcvtnt z0.d, p0/m, z0.d
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:

 fcvtnt z0.b, p0/m, z0.h
-// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 // CHECK-NEXT: fcvtnt z0.b, p0/m, z0.h
 // CHECK-NOT: [[@LINE-1]]:{{[0-9]+}}:

diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp
index b432e7ac2d868..b662fbe3457cb 100644
--- a/llvm/unittests/TargetParser/TargetParserTest.cpp
+++ b/llvm/unittests/TargetParser/TargetParserTest.cpp
@@ -1732,7 +1732,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
       AArch64::AEK_RCPC3,      AArch64::AEK_THE,       AArch64::AEK_D128,
       AArch64::AEK_LSE128,     AArch64::AEK_SPECRES2,  AArch64::AEK_RASv2,
       AArch64::AEK_ITE,        AArch64::AEK_GCS,       AArch64::AEK_FPMR,
-  };
+      AArch64::AEK_FP8};

   std::vector<StringRef> Features;

@@ -1805,6 +1805,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) {
   EXPECT_TRUE(llvm::is_contained(Features, "+ite"));
   EXPECT_TRUE(llvm::is_contained(Features, "+gcs"));
   EXPECT_TRUE(llvm::is_contained(Features, "+fpmr"));
+  EXPECT_TRUE(llvm::is_contained(Features, "+fp8"));

   // Assuming we listed every extension above, this should produce the same
   // result. (note that AEK_NONE doesn't have a name so it won't be in the
@@ -1929,6 +1930,7 @@ TEST(TargetParserTest, AArch64ArchExtFeature) {
       {"rasv2", "norasv2", "+rasv2", "-rasv2"},
       {"gcs", "nogcs", "+gcs", "-gcs"},
       {"fpmr", "nofpmr", "+fpmr", "-fpmr"},
+      {"fp8", "nofp8", "+fp8", "-fp8"},
   };

   for (unsigned i = 0; i < std::size(ArchExt); i++) {

From f3f0672039f42610101db8bbe97f521a65ede413 Mon Sep 17 00:00:00 2001
From: Egor Zhdan 
Date: Thu, 26 Oct 2023 16:14:23 +0100
Subject: [PATCH 083/877] [Clang] Fix several `-Wdocumentation` warnings (NFC)

```
clang/include/clang/Lex/Preprocessor.h:2893:14: warning: parameter 'isEnter:' not found in the function declaration [-Wdocumentation]
  /// \param isEnter: true if this PP is entering a region; otherwise, this PP
             ^~~~~~~~
clang/include/clang/Lex/Preprocessor.h:2895:14: warning: parameter 'Loc:' not found in the function declaration [-Wdocumentation]
  /// \param Loc: the location of the entry or exit of a
             ^~~~
clang/include/clang/Lex/Preprocessor.h:2907:14: warning: parameter 'StartLoc:' not found in the function declaration [-Wdocumentation]
  /// \param StartLoc: output argument. 
It will be set to the start location of
             ^~~~~~~~~
```
---
 clang/include/clang/Lex/Preprocessor.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 18d88407ae12c..4a99447e757c6 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -2879,9 +2879,9 @@ class Preprocessor {
   /// Alter the state of whether this PP currently is in a
   /// "-Wunsafe-buffer-usage" opt-out region.
   ///
-  /// \param isEnter: true if this PP is entering a region; otherwise, this PP
+  /// \param isEnter true if this PP is entering a region; otherwise, this PP
   ///                 is exiting a region
-  /// \param Loc: the location of the entry or exit of a
+  /// \param Loc the location of the entry or exit of a
   ///            region
   /// \return true iff it is INVALID to enter or exit a region, i.e.,
   /// attempt to enter a region before exiting a previous region, or exiting a
@@ -2893,7 +2893,7 @@ class Preprocessor {
   /// opt-out region
   bool isPPInSafeBufferOptOutRegion();

-  /// \param StartLoc: output argument. It will be set to the start location of
+  /// \param StartLoc output argument. It will be set to the start location of
   /// the current "-Wunsafe-buffer-usage" opt-out region iff this function
   /// returns true.
   /// \return true iff this PP is currently in a "-Wunsafe-buffer-usage"

From a6d0e8791e1015a12f8277e6032eae9fdab686fe Mon Sep 17 00:00:00 2001
From: Vlad Serebrennikov 
Date: Thu, 26 Oct 2023 19:22:52 +0400
Subject: [PATCH 084/877] [clang][NFC] Refactor enums that hold size of `Type`
 and `DeclContext` bit-fields (#70296)

This patch refactors said enums to hold the total size of a bit-field, and
not just non-inherited bits. This brings `Type` and `DeclContext` in line
with `Comment` and `Stmt`. It also makes it unnecessary to list all
transitive bases of a bit-field as unnamed bit-fields, which makes it more
friendly towards debuggers.
---
 clang/include/clang/AST/DeclBase.h        | 49 ++++++++++-------------
 clang/include/clang/AST/Type.h            |  6 +--
 clang/lib/Serialization/ASTWriterDecl.cpp | 18 ++++-----
 3 files changed, 33 insertions(+), 40 deletions(-)

diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h
index d383e46e22e16..978e4255e877e 100644
--- a/clang/include/clang/AST/DeclBase.h
+++ b/clang/include/clang/AST/DeclBase.h
@@ -1515,16 +1515,14 @@ class DeclContext {
     uint64_t IsThisDeclarationADemotedDefinition : 1;
   };

-  /// Number of non-inherited bits in TagDeclBitfields.
-  enum { NumTagDeclBits = 10 };
+  /// Number of inherited and non-inherited bits in TagDeclBitfields.
+  enum { NumTagDeclBits = NumDeclContextBits + 10 };

   /// Stores the bits used by EnumDecl.
   /// If modified NumEnumDeclBit and the accessor
   /// methods in EnumDecl should be updated appropriately.
   class EnumDeclBitfields {
     friend class EnumDecl;
-    /// For the bits in DeclContextBitfields.
-    uint64_t : NumDeclContextBits;
     /// For the bits in TagDeclBitfields.
     uint64_t : NumTagDeclBits;

@@ -1554,16 +1552,14 @@ class DeclContext {
     uint64_t HasODRHash : 1;
   };

-  /// Number of non-inherited bits in EnumDeclBitfields.
-  enum { NumEnumDeclBits = 20 };
+  /// Number of inherited and non-inherited bits in EnumDeclBitfields.
+  enum { NumEnumDeclBits = NumTagDeclBits + 20 };

   /// Stores the bits used by RecordDecl.
   /// If modified NumRecordDeclBits and the accessor
   /// methods in RecordDecl should be updated appropriately. 
class RecordDeclBitfields { friend class RecordDecl; - /// For the bits in DeclContextBitfields. - uint64_t : NumDeclContextBits; /// For the bits in TagDeclBitfields. uint64_t : NumTagDeclBits; @@ -1615,8 +1611,8 @@ class DeclContext { uint64_t ODRHash : 26; }; - /// Number of non-inherited bits in RecordDeclBitfields. - enum { NumRecordDeclBits = 41 }; + /// Number of inherited and non-inherited bits in RecordDeclBitfields. + enum { NumRecordDeclBits = NumTagDeclBits + 41 }; /// Stores the bits used by OMPDeclareReductionDecl. /// If modified NumOMPDeclareReductionDeclBits and the accessor @@ -1631,8 +1627,9 @@ class DeclContext { uint64_t InitializerKind : 2; }; - /// Number of non-inherited bits in OMPDeclareReductionDeclBitfields. - enum { NumOMPDeclareReductionDeclBits = 2 }; + /// Number of inherited and non-inherited bits in + /// OMPDeclareReductionDeclBitfields. + enum { NumOMPDeclareReductionDeclBits = NumDeclContextBits + 2 }; /// Stores the bits used by FunctionDecl. /// If modified NumFunctionDeclBits and the accessor @@ -1711,16 +1708,14 @@ class DeclContext { uint64_t FriendConstraintRefersToEnclosingTemplate : 1; }; - /// Number of non-inherited bits in FunctionDeclBitfields. - enum { NumFunctionDeclBits = 31 }; + /// Number of inherited and non-inherited bits in FunctionDeclBitfields. + enum { NumFunctionDeclBits = NumDeclContextBits + 31 }; /// Stores the bits used by CXXConstructorDecl. If modified /// NumCXXConstructorDeclBits and the accessor /// methods in CXXConstructorDecl should be updated appropriately. class CXXConstructorDeclBitfields { friend class CXXConstructorDecl; - /// For the bits in DeclContextBitfields. - uint64_t : NumDeclContextBits; /// For the bits in FunctionDeclBitfields. uint64_t : NumFunctionDeclBits; @@ -1739,10 +1734,8 @@ class DeclContext { uint64_t IsSimpleExplicit : 1; }; - /// Number of non-inherited bits in CXXConstructorDeclBitfields. - enum { - NumCXXConstructorDeclBits = 64 - NumDeclContextBits - NumFunctionDeclBits - }; + /// Number of inherited and non-inherited bits in CXXConstructorDeclBitfields. + enum { NumCXXConstructorDeclBits = NumFunctionDeclBits + 20 }; /// Stores the bits used by ObjCMethodDecl. /// If modified NumObjCMethodDeclBits and the accessor @@ -1803,8 +1796,8 @@ class DeclContext { uint64_t HasSkippedBody : 1; }; - /// Number of non-inherited bits in ObjCMethodDeclBitfields. - enum { NumObjCMethodDeclBits = 24 }; + /// Number of inherited and non-inherited bits in ObjCMethodDeclBitfields. + enum { NumObjCMethodDeclBits = NumDeclContextBits + 24 }; /// Stores the bits used by ObjCContainerDecl. /// If modified NumObjCContainerDeclBits and the accessor @@ -1819,10 +1812,10 @@ class DeclContext { SourceLocation AtStart; }; - /// Number of non-inherited bits in ObjCContainerDeclBitfields. + /// Number of inherited and non-inherited bits in ObjCContainerDeclBitfields. /// Note that here we rely on the fact that SourceLocation is 32 bits /// wide. We check this with the static_assert in the ctor of DeclContext. - enum { NumObjCContainerDeclBits = 64 - NumDeclContextBits }; + enum { NumObjCContainerDeclBits = 64 }; /// Stores the bits used by LinkageSpecDecl. /// If modified NumLinkageSpecDeclBits and the accessor @@ -1843,8 +1836,8 @@ class DeclContext { uint64_t HasBraces : 1; }; - /// Number of non-inherited bits in LinkageSpecDeclBitfields. - enum { NumLinkageSpecDeclBits = 4 }; + /// Number of inherited and non-inherited bits in LinkageSpecDeclBitfields. 
+ enum { NumLinkageSpecDeclBits = NumDeclContextBits + 4 }; /// Stores the bits used by BlockDecl. /// If modified NumBlockDeclBits and the accessor @@ -1869,8 +1862,8 @@ class DeclContext { uint64_t CanAvoidCopyToHeap : 1; }; - /// Number of non-inherited bits in BlockDeclBitfields. - enum { NumBlockDeclBits = 5 }; + /// Number of inherited and non-inherited bits in BlockDeclBitfields. + enum { NumBlockDeclBits = NumDeclContextBits + 5 }; /// Pointer to the data structure used to lookup declarations /// within this context (or a DependentStoredDeclsMap if this is a diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index e3dbe3b8a45cc..1e8e1303e65f6 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -1663,11 +1663,12 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { /// Actually an ArrayType::ArraySizeModifier. unsigned SizeModifier : 3; }; + enum { NumArrayTypeBits = NumTypeBits + 6 }; class ConstantArrayTypeBitfields { friend class ConstantArrayType; - unsigned : NumTypeBits + 3 + 3; + unsigned : NumArrayTypeBits; /// Whether we have a stored size expression. unsigned HasStoredSizeExpr : 1; @@ -1780,7 +1781,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned Keyword : 8; }; - enum { NumTypeWithKeywordBits = 8 }; + enum { NumTypeWithKeywordBits = NumTypeBits + 8 }; class ElaboratedTypeBitfields { friend class ElaboratedType; @@ -1913,7 +1914,6 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { class DependentTemplateSpecializationTypeBitfields { friend class DependentTemplateSpecializationType; - unsigned : NumTypeBits; unsigned : NumTypeWithKeywordBits; /// The number of template arguments named in this class template diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 8a2ea7c7624ce..b3364113abf15 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -431,7 +431,7 @@ void ASTDeclWriter::VisitTypeAliasDecl(TypeAliasDecl *D) { } void ASTDeclWriter::VisitTagDecl(TagDecl *D) { - static_assert(DeclContext::NumTagDeclBits == 10, + static_assert(DeclContext::NumTagDeclBits == 23, "You need to update the serializer after you change the " "TagDeclBits"); @@ -459,7 +459,7 @@ void ASTDeclWriter::VisitTagDecl(TagDecl *D) { } void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { - static_assert(DeclContext::NumEnumDeclBits == 20, + static_assert(DeclContext::NumEnumDeclBits == 43, "You need to update the serializer after you change the " "EnumDeclBits"); @@ -506,7 +506,7 @@ void ASTDeclWriter::VisitEnumDecl(EnumDecl *D) { } void ASTDeclWriter::VisitRecordDecl(RecordDecl *D) { - static_assert(DeclContext::NumRecordDeclBits == 41, + static_assert(DeclContext::NumRecordDeclBits == 64, "You need to update the serializer after you change the " "RecordDeclBits"); @@ -578,7 +578,7 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { } void ASTDeclWriter::VisitFunctionDecl(FunctionDecl *D) { - static_assert(DeclContext::NumFunctionDeclBits == 31, + static_assert(DeclContext::NumFunctionDeclBits == 44, "You need to update the serializer after you change the " "FunctionDeclBits"); @@ -726,7 +726,7 @@ void ASTDeclWriter::VisitCXXDeductionGuideDecl(CXXDeductionGuideDecl *D) { } void ASTDeclWriter::VisitObjCMethodDecl(ObjCMethodDecl *D) { - static_assert(DeclContext::NumObjCMethodDeclBits == 24, + static_assert(DeclContext::NumObjCMethodDeclBits == 37, "You need to update the 
serializer after you change the " "ObjCMethodDeclBits"); @@ -788,7 +788,7 @@ void ASTDeclWriter::VisitObjCTypeParamDecl(ObjCTypeParamDecl *D) { } void ASTDeclWriter::VisitObjCContainerDecl(ObjCContainerDecl *D) { - static_assert(DeclContext::NumObjCContainerDeclBits == 51, + static_assert(DeclContext::NumObjCContainerDeclBits == 64, "You need to update the serializer after you change the " "ObjCContainerDeclBits"); @@ -1268,7 +1268,7 @@ void ASTDeclWriter::VisitCapturedDecl(CapturedDecl *CD) { } void ASTDeclWriter::VisitLinkageSpecDecl(LinkageSpecDecl *D) { - static_assert(DeclContext::NumLinkageSpecDeclBits == 4, + static_assert(DeclContext::NumLinkageSpecDeclBits == 17, "You need to update the serializer after you change the" "LinkageSpecDeclBits"); @@ -1479,7 +1479,7 @@ void ASTDeclWriter::VisitCXXMethodDecl(CXXMethodDecl *D) { } void ASTDeclWriter::VisitCXXConstructorDecl(CXXConstructorDecl *D) { - static_assert(DeclContext::NumCXXConstructorDeclBits == 20, + static_assert(DeclContext::NumCXXConstructorDeclBits == 64, "You need to update the serializer after you change the " "CXXConstructorDeclBits"); @@ -1960,7 +1960,7 @@ void ASTDeclWriter::VisitOMPRequiresDecl(OMPRequiresDecl *D) { } void ASTDeclWriter::VisitOMPDeclareReductionDecl(OMPDeclareReductionDecl *D) { - static_assert(DeclContext::NumOMPDeclareReductionDeclBits == 2, + static_assert(DeclContext::NumOMPDeclareReductionDeclBits == 15, "You need to update the serializer after you change the " "NumOMPDeclareReductionDeclBits"); From a3490920615e963732d70df2ef77eafa35b8ee0f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 25 Oct 2023 11:19:27 +0200 Subject: [PATCH 085/877] [libc++] Add __small_buffer This is an implementation detail for `move_only_function` (and potentially other type-erasing classes). Reviewed By: #libc, ldionne Spies: Mordante, ldionne, EricWF, libcxx-commits Differential Revision: https://reviews.llvm.org/D140259 --- libcxx/include/CMakeLists.txt | 1 + libcxx/include/__utility/small_buffer.h | 99 +++++++++++++++++++ libcxx/include/module.modulemap.in | 1 + .../utilities/utility/small_buffer.pass.cpp | 74 ++++++++++++++ 4 files changed, 175 insertions(+) create mode 100644 libcxx/include/__utility/small_buffer.h create mode 100644 libcxx/test/libcxx/utilities/utility/small_buffer.pass.cpp diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 1f36ff6b3896e..b7b14200498a2 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -854,6 +854,7 @@ set(files __utility/piecewise_construct.h __utility/priority_tag.h __utility/rel_ops.h + __utility/small_buffer.h __utility/swap.h __utility/to_underlying.h __utility/unreachable.h diff --git a/libcxx/include/__utility/small_buffer.h b/libcxx/include/__utility/small_buffer.h new file mode 100644 index 0000000000000..0c6d4986a5851 --- /dev/null +++ b/libcxx/include/__utility/small_buffer.h @@ -0,0 +1,99 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___UTILITY_SMALL_BUFFER_H
+#define _LIBCPP___UTILITY_SMALL_BUFFER_H
+
+#include <__config>
+#include <__memory/construct_at.h>
+#include <__type_traits/decay.h>
+#include <__type_traits/is_trivially_destructible.h>
+#include <__type_traits/is_trivially_move_constructible.h>
+#include <__utility/exception_guard.h>
+#include <__utility/forward.h>
+#include <cstddef>
+#include <new>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 23
+
+// __small_buffer is a helper class to perform the well known SBO (small buffer optimization). It is mainly useful to
+// allow type-erasing classes like move_only_function to store small objects in a local buffer without requiring an
+// allocation.
+//
+// This small buffer class only allows storing trivially relocatable objects inside the local storage to allow
+// __small_buffer to be trivially relocatable itself. Since the buffer doesn't know what's stored inside it, the user
+// has to manage the object's lifetime, in particular the destruction of the object.
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <size_t _BufferSize, size_t _BufferAlignment>
+  requires(_BufferSize > 0 && _BufferAlignment > 0)
+class __small_buffer {
+public:
+  template <class _Stored, class _Decayed = decay_t<_Stored>>
+  static constexpr bool __fits_in_buffer =
+      is_trivially_move_constructible_v<_Decayed> && is_trivially_destructible_v<_Decayed> &&
+      sizeof(_Decayed) <= _BufferSize && alignof(_Decayed) <= _BufferAlignment;
+
+  _LIBCPP_HIDE_FROM_ABI __small_buffer() = default;
+  __small_buffer(const __small_buffer&) = delete;
+  __small_buffer& operator=(const __small_buffer&) = delete;
+  _LIBCPP_HIDE_FROM_ABI ~__small_buffer() = default;
+
+  // Relocates the buffer - __delete() should never be called on a moved-from __small_buffer
+  _LIBCPP_HIDE_FROM_ABI __small_buffer(__small_buffer&&) = default;
+  _LIBCPP_HIDE_FROM_ABI __small_buffer& operator=(__small_buffer&&) = default;
+
+  template <class _Stored>
+  _LIBCPP_HIDE_FROM_ABI _Stored* __get() {
+    if constexpr (__fits_in_buffer<_Stored>)
+      return std::launder(reinterpret_cast<_Stored*>(__buffer_));
+    else
+      return *std::launder(reinterpret_cast<_Stored**>(__buffer_));
+  }
+
+  template <class _Stored>
+  _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE _LIBCPP_HIDE_FROM_ABI _Stored* __alloc() {
+    if constexpr (__fits_in_buffer<_Stored>) {
+      return std::launder(reinterpret_cast<_Stored*>(__buffer_));
+    } else {
+      byte* __allocation = static_cast<byte*>(::operator new[](sizeof(_Stored), align_val_t{alignof(_Stored)}));
+      std::construct_at(reinterpret_cast<byte**>(__buffer_), __allocation);
+      return std::launder(reinterpret_cast<_Stored*>(__allocation));
+    }
+  }
+
+  template <class _Stored>
+  _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE _LIBCPP_HIDE_FROM_ABI void __dealloc() noexcept {
+    if constexpr (!__fits_in_buffer<_Stored>)
+      ::operator delete[](*reinterpret_cast<byte**>(__buffer_), sizeof(_Stored), align_val_t{alignof(_Stored)});
+  }
+
+  template <class _Stored, class... _Args>
+  _LIBCPP_AVAILABILITY_SIZED_NEW_DELETE _LIBCPP_HIDE_FROM_ABI void __construct(_Args&&... 
__args) {
+    _Stored* __buffer = __alloc<_Stored>();
+    auto __guard = std::__make_exception_guard([&] { __dealloc<_Stored>(); });
+    std::construct_at(__buffer, std::forward<_Args>(__args)...);
+    __guard.__complete();
+  }
+
+private:
+  alignas(_BufferAlignment) byte __buffer_[_BufferSize];
+};
+
+#  undef _LIBCPP_SMALL_BUFFER_TRIVIAL_ABI
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 23
+
+#endif // _LIBCPP___UTILITY_SMALL_BUFFER_H
diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in
index 41158212c9a86..f447b2fa0a0ce 100644
--- a/libcxx/include/module.modulemap.in
+++ b/libcxx/include/module.modulemap.in
@@ -2079,6 +2079,7 @@ module std_private_utility_pair_fwd [system] { header "__fwd/pair.
 module std_private_utility_piecewise_construct [system] { header "__utility/piecewise_construct.h" }
 module std_private_utility_priority_tag [system] { header "__utility/priority_tag.h" }
 module std_private_utility_rel_ops [system] { header "__utility/rel_ops.h" }
+module std_private_utility_small_buffer [system] { header "__utility/small_buffer.h" }
 module std_private_utility_swap [system] {
   header "__utility/swap.h"
   export std_private_type_traits_is_swappable
diff --git a/libcxx/test/libcxx/utilities/utility/small_buffer.pass.cpp b/libcxx/test/libcxx/utilities/utility/small_buffer.pass.cpp
new file mode 100644
index 0000000000000..2214efa486870
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/utility/small_buffer.pass.cpp
@@ -0,0 +1,74 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// XFAIL: availability-aligned_allocation-missing
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+
+#include "test_macros.h"
+
+TEST_DIAGNOSTIC_PUSH
+TEST_CLANG_DIAGNOSTIC_IGNORED("-Wprivate-header")
+#include <__utility/small_buffer.h>
+TEST_DIAGNOSTIC_POP
+
+#include <cassert>
+#include <memory>
+#include <utility>
+
+struct NotTriviallyRelocatable {
+  char c_;
+
+  NotTriviallyRelocatable(char c) : c_(c) {}
+  ~NotTriviallyRelocatable() {}
+};
+
+struct alignas(16) Overaligned {
+  int i;
+};
+
+int main(int, char**) {
+  using BufferT = std::__small_buffer<8, 8>;
+  static_assert(sizeof(BufferT) == 8);
+  static_assert(alignof(BufferT) == 8);
+  static_assert(BufferT::__fits_in_buffer<int>);
+  static_assert(!BufferT::__fits_in_buffer<NotTriviallyRelocatable>);
+  static_assert(!BufferT::__fits_in_buffer<Overaligned>);
+
+  BufferT buf;
+
+  { // construct/destroy in the same place
+    buf.__construct<int>(3);
+    assert(*buf.__get<int>() == 3);
+    std::destroy_at(buf.__get<int>());
+    buf.__dealloc<int>();
+
+    buf.__construct<NotTriviallyRelocatable>(3);
+    assert(buf.__get<NotTriviallyRelocatable>()->c_ == 3);
+    std::destroy_at(buf.__get<NotTriviallyRelocatable>());
+    buf.__dealloc<NotTriviallyRelocatable>();
+  }
+
+  { // Move the buffer around
+    buf.__construct<int>(3);
+    assert(*buf.__get<int>() == 3);
+    auto buf2 = std::move(buf);
+    assert(*buf2.__get<int>() == 3);
+    std::destroy_at(buf2.__get<int>());
+    buf2.__dealloc<int>();
+
+    buf.__construct<NotTriviallyRelocatable>(3);
+    assert(buf.__get<NotTriviallyRelocatable>()->c_ == 3);
+    auto buf3 = std::move(buf);
+    assert(buf3.__get<NotTriviallyRelocatable>()->c_ == 3);
+    std::destroy_at(buf3.__get<NotTriviallyRelocatable>());
+    buf3.__dealloc<NotTriviallyRelocatable>();
+  }
+
+  return 0;
+}
From c555a12377307909bd47e5de798059089eaa3f85 Mon Sep 17 00:00:00 2001
From: Igor Zhukov 
Date: Tue, 26 Sep 2023 09:33:41 -0400
Subject: [PATCH 086/877] [libc++] Make sure std::declval() produces an error
 when ODR-used

Fixes 
https://github.com/llvm/llvm-project/issues/61202

Differential Revision: https://reviews.llvm.org/D145376
---
 libcxx/include/__utility/declval.h            |  6 ++-
 .../deleted_output_functions.verify.cpp      | 53 +++++++++++--------
 .../utility/declval/declval.verify.cpp       | 16 ++++++
 3 files changed, 52 insertions(+), 23 deletions(-)
 create mode 100644 libcxx/test/std/utilities/utility/declval/declval.verify.cpp

diff --git a/libcxx/include/__utility/declval.h b/libcxx/include/__utility/declval.h
index c2f4bec132827..d0856b8afa4db 100644
--- a/libcxx/include/__utility/declval.h
+++ b/libcxx/include/__utility/declval.h
@@ -27,7 +27,11 @@ _Tp __declval(long);
 _LIBCPP_SUPPRESS_DEPRECATED_POP

 template <class _Tp>
-decltype(std::__declval<_Tp>(0)) declval() _NOEXCEPT;
+_LIBCPP_HIDE_FROM_ABI decltype(std::__declval<_Tp>(0)) declval() _NOEXCEPT {
+  static_assert(!__is_same(_Tp, _Tp),
+                "std::declval can only be used in an unevaluated context. "
+                "It's likely that your current usage is trying to extract a value from the function.");
+}

 _LIBCPP_END_NAMESPACE_STD

diff --git a/libcxx/test/std/input.output/iostream.format/output.streams/ostream/deleted_output_functions.verify.cpp b/libcxx/test/std/input.output/iostream.format/output.streams/ostream/deleted_output_functions.verify.cpp
index d8d2a94791b3a..2916a94cdcd6f 100644
--- a/libcxx/test/std/input.output/iostream.format/output.streams/ostream/deleted_output_functions.verify.cpp
+++ b/libcxx/test/std/input.output/iostream.format/output.streams/ostream/deleted_output_functions.verify.cpp
@@ -18,38 +18,47 @@
 void f() {
   std::ostringstream s;

+#ifndef TEST_HAS_NO_CHAR8_T
+  char8_t c8_s[] = u8"test";
+  const char8_t* c8_cs = u8"test";
+#endif
+  char16_t c16_s[] = u"test";
+  const char16_t* c16_cs = u"test";
+  char32_t c32_s[] = U"test";
+  const char32_t* c32_cs = U"test";
 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
-
-  s << wchar_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<wchar_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<const wchar_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  wchar_t w_s[] = L"test";
+  const wchar_t* w_cs = L"test";
+  s << wchar_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << w_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << w_cs; // expected-error {{overload resolution selected deleted operator '<<'}}

   std::wostringstream sw;
 #  ifndef TEST_HAS_NO_CHAR8_T
-  sw << char8_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << std::declval<char8_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << std::declval<const char8_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << char8_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c8_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c8_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
 #  endif
-  sw << char16_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << std::declval<char16_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << std::declval<const char16_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << char32_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << std::declval<char32_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  sw << 
std::declval<const char32_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << char16_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c16_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c16_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << char32_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c32_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  sw << c32_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
 #endif // TEST_HAS_NO_WIDE_CHARACTERS

 #ifndef TEST_HAS_NO_CHAR8_T
-  s << char8_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<char8_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<const char8_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << char8_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c8_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c8_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
 #endif
-  s << char16_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<char16_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<const char16_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << char32_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<char32_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
-  s << std::declval<const char32_t*>(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << char16_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c16_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c16_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << char32_t(); // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c32_s; // expected-error {{overload resolution selected deleted operator '<<'}}
+  s << c32_cs; // expected-error {{overload resolution selected deleted operator '<<'}}
 }
diff --git a/libcxx/test/std/utilities/utility/declval/declval.verify.cpp b/libcxx/test/std/utilities/utility/declval/declval.verify.cpp
new file mode 100644
index 0000000000000..a2bd1992aaa83
--- /dev/null
+++ b/libcxx/test/std/utilities/utility/declval/declval.verify.cpp
@@ -0,0 +1,16 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +// template typename add_rvalue_reference::type declval() noexcept; + +#include + +int x = std::declval< + int>(); // expected-error-re@*:* {{static assertion failed{{.*}}std::declval can only be used in an unevaluated context.}} From 166b3a86173666ccf823d09e30f40f4a0d386e18 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 26 Oct 2023 11:31:36 -0400 Subject: [PATCH 087/877] [libc++][tests] Fix a few remaining instances of outdated static assertion regexes in our test suite --- .../cpp17_iterator_concepts.verify.cpp | 86 +++++++++---------- .../mdspan/layout_stride/extents.verify.cpp | 4 +- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp index 3bc49cd3b1b18..1661738dbdfa1 100644 --- a/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp +++ b/libcxx/test/libcxx/algorithms/cpp17_iterator_concepts.verify.cpp @@ -83,26 +83,26 @@ struct diff_t_not_signed : valid_iterator { }; void check_iterator_requirements() { - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expected-note@*:* {{indirection requires pointer operand}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expected-note@*:* {{cannot increment value of type 'missing_preincrement'}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'not_move_constructible' does not satisfy '__cpp17_move_constructible'}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'not_copy_constructible' does not satisfy '__cpp17_copy_constructible'}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'not_move_assignable' does not satisfy '__cpp17_copy_assignable'}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expectted-note@*:* {{because 'not_copy_assignable' does not satisfy '__cpp17_copy_assignable'}} - static_assert(std::__cpp17_iterator); // expected-error-re {{{{static assertion|static_assert}} failed}} + static_assert(std::__cpp17_iterator); // expected-error-re {{static assertion failed}} // expectted-note@*:* {{'is_signed_v<__iter_diff_t >' evaluated to false}} } @@ -115,10 +115,10 @@ bool operator==(not_unequality_comparable, not_unequality_comparable); bool operator!=(not_unequality_comparable, not_unequality_comparable) = delete; void check_input_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable); // expected-error-re {{{{static 
assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_equality_comparable); // expected-error-re {{static assertion failed}} // expected-note@*:* {{'__lhs == __rhs' would be invalid: overload resolution selected deleted operator '=='}} - _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_INPUT_ITERATOR(not_unequality_comparable); // expected-error-re {{static assertion failed}} // expected-note@*:* {{'__lhs != __rhs' would be invalid: overload resolution selected deleted operator '!='}} } @@ -138,9 +138,9 @@ struct postincrement_not_ref : valid_iterator {}; bool operator==(postincrement_not_ref, postincrement_not_ref); void check_forward_iterator_requirements() { - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(not_default_constructible); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'not_default_constructible' does not satisfy '__cpp17_default_constructible'}} - _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_FORWARD_ITERATOR(postincrement_not_ref); // expected-error-re {{static assertion failed}} #ifndef _AIX // expected-note@*:* {{because type constraint 'convertible_to::Proxy, const postincrement_not_ref &>' was not satisfied}} #endif @@ -167,11 +167,11 @@ struct not_returning_iter_reference : valid_forward_iterator >' was not satisfied}} } @@ -359,62 +359,62 @@ struct missing_const_const_greater_eq : valid_random_access_iterator __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > __iter' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because '__iter > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) > std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>'}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_less_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because '__iter <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq); // expected-error-re {{{{static assertion|static_assert}} 
failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_less_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= __iter' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_less_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because '__iter <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_less_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) <= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '<='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_greater_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because '__iter >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_mut_greater_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= __iter' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_mut_const_greater_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because '__iter >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} - _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq); // expected-error-re {{{{static assertion|static_assert}} failed}} + _LIBCPP_REQUIRE_CPP17_RANDOM_ACCESS_ITERATOR(missing_const_const_greater_eq); // expected-error-re {{static assertion failed}} // expected-note@*:* {{because 'std::as_const(__iter) >= std::as_const(__iter)' would be invalid: overload resolution selected deleted operator '>='}} } diff --git a/libcxx/test/std/containers/views/mdspan/layout_stride/extents.verify.cpp b/libcxx/test/std/containers/views/mdspan/layout_stride/extents.verify.cpp index 4742527f7af11..46f2b774bcbd9 100644 --- a/libcxx/test/std/containers/views/mdspan/layout_stride/extents.verify.cpp +++ b/libcxx/test/std/containers/views/mdspan/layout_stride/extents.verify.cpp @@ -23,11 +23,11 @@ #include void not_extents() { - // expected-error-re@*:* {{{{(static_assert|static assertion)}} failed {{.*}}layout_stride::mapping template argument must be a specialization of extents}} + // expected-error-re@*:* {{static assertion failed {{.*}}layout_stride::mapping template argument must be a specialization of extents}} [[maybe_unused]] std::layout_stride::mapping mapping; } void representable() { - // expected-error-re@*:* {{{{(static_assert|static 
assertion)}} failed {{.*}}layout_stride::mapping product of static extents must be representable as index_type.}} + // expected-error-re@*:* {{static assertion failed {{.*}}layout_stride::mapping product of static extents must be representable as index_type.}} [[maybe_unused]] std::layout_stride::mapping> mapping; }
From cee08ff342f39f8cd2b3b66f48ecb33d8b5efe65 Mon Sep 17 00:00:00 2001 From: Joseph Huber <35342157+jhuber6@users.noreply.github.com> Date: Thu, 26 Oct 2023 10:36:34 -0500 Subject: [PATCH 088/877] [Libomptarget] Do not pass 'nogpulib' to the non-LTO Nvidia tests (#70327) Summary: For the other tests we pass `-nogpulib` to ensure that we set up the needed libraries correctly. However, this caused problems for the non-LTO build and test of Nvidia systems. In general this is because we would do a separate compile of the libomptarget device runtime and then link in that cubin. This exercised the runtime in a lot of ways it's not used to, since doing things this way was hardly expected or tested. This patch disables it only for the Nvidia non-LTO build so that we still get the effect of `--libomptarget-nvptx-bc-path` rather than ignoring it.
--- openmp/libomptarget/test/lit.cfg | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 80a3c10d3a949..6dab31bd35a9f 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -128,7 +128,8 @@ elif config.operating_system == 'Darwin': config.test_flags += " -Wl,-rpath," + config.library_dir config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory else: # Unices - config.test_flags += " -nogpulib" + if config.libomptarget_current_target != "nvptx64-nvidia-cuda": + config.test_flags += " -nogpulib" config.test_flags += " -Wl,-rpath," + config.library_dir config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory config.test_flags += " -Wl,-rpath," + config.llvm_lib_directory
From c65ec9d9195ad4afee2bbf69fd77607697d43480 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 26 Oct 2023 08:36:03 -0700 Subject: [PATCH 089/877] Revert "[SLP]Improve isGatherShuffledEntry by trying per-register shuffle." This reverts commit 560bad013ebcb8d2c2c1722e35270b9a70ab40ce to fix a bug reported in https://lab.llvm.org/buildbot/#/builders/5/builds/37763.
--- .../Transforms/Vectorize/SLPVectorizer.cpp | 462 +++++------------- .../X86/multi-nodes-to-shuffle.ll | 45 +- 2 files changed, 120 insertions(+), 387 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 9b5da445daaab..4f82d2d1d6d91 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2507,31 +2507,17 @@ class BoUpSLP { /// instruction in the list). Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as a single register - /// shuffle(s) of previous tree entries. + /// Checks if the gathered \p VL can be represented as shuffle(s) of previous + /// tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. Must form single-register vector. + /// permutations. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. + /// previous tree entries. 
\p Mask is filled with the shuffle mask. std::optional - isGatherShuffledSingleRegisterEntry( - const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part); - - /// Checks if the gathered \p VL can be represented as multi-register - /// shuffle(s) of previous tree entries. - /// \param TE Tree entry checked for permutation. - /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. - /// \returns per-register series of ShuffleKind, if gathered values can be - /// represented as shuffles of previous tree entries. \p Mask is filled with - /// the shuffle mask (also on per-register base). - SmallVector> - isGatherShuffledEntry( - const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, - SmallVectorImpl> &Entries, - unsigned NumParts); + isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, + SmallVectorImpl &Mask, + SmallVectorImpl &Entries); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -7004,11 +6990,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { BoUpSLP &R; SmallPtrSetImpl &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - /// While set, still trying to estimate the cost for the same nodes and we - /// can delay actual cost estimation (virtual shuffle instruction emission). - /// May help better estimate the cost if same nodes must be permuted + allows - /// to move most of the long shuffles cost estimation to TTI. - bool SameNodesEstimated = true; static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { if (Ty->getScalarType()->isPointerTy()) { @@ -7249,49 +7230,6 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } return Cost; } - /// Transforms mask \p CommonMask per given \p Mask to make proper set after - /// shuffle emission. - static void transformMaskAfterShuffle(MutableArrayRef CommonMask, - ArrayRef Mask) { - for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) - if (Mask[Idx] != PoisonMaskElem) - CommonMask[Idx] = Idx; - } - /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given - /// mask \p Mask, register number \p Part, that includes \p SliceSize - /// elements. - void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, - ArrayRef Mask, unsigned Part, - unsigned SliceSize) { - if (SameNodesEstimated) { - // Delay the cost estimation if the same nodes are reshuffling. - // If we already requested the cost of reshuffling of E1 and E2 before, no - // need to estimate another cost with the sub-Mask, instead include this - // sub-Mask into the CommonMask to estimate it later and avoid double cost - // estimation. - if ((InVectors.size() == 2 && - InVectors.front().get() == &E1 && - InVectors.back().get() == E2) || - (!E2 && InVectors.front().get() == &E1)) { - assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), - [](int Idx) { return Idx == PoisonMaskElem; }) && - "Expected all poisoned elements."); - ArrayRef SubMask = - ArrayRef(Mask).slice(Part * SliceSize, SliceSize); - copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); - return; - } - // Found non-matching nodes - need to estimate the cost for the matched - // and transform mask. - Cost += createShuffle(InVectors.front(), - InVectors.size() == 1 ? 
nullptr : InVectors.back(), - CommonMask); - transformMaskAfterShuffle(CommonMask, CommonMask); - } - SameNodesEstimated = false; - Cost += createShuffle(&E1, E2, Mask); - transformMaskAfterShuffle(CommonMask, Mask); - } class ShuffleCostBuilder { const TargetTransformInfo &TTI; @@ -7555,74 +7493,31 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. Cost += computeExtractCost(VL, Mask, ShuffleKind); - InVectors.assign(1, E); - CommonMask.assign(Mask.begin(), Mask.end()); - transformMaskAfterShuffle(CommonMask, CommonMask); - SameNodesEstimated = false; return VecBase; } - void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { - if (&E1 == &E2) { + void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { + if (E1 == E2) { assert(all_of(Mask, - [&](int Idx) { - return Idx < static_cast(E1.getVectorFactor()); + [=](int Idx) { + return Idx < static_cast(E1->getVectorFactor()); }) && "Expected single vector shuffle mask."); add(E1, Mask); return; } - if (InVectors.empty()) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({&E1, &E2}); - return; - } - assert(!CommonMask.empty() && "Expected non-empty common mask."); - auto *MaskVecTy = - FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); - assert(NumParts > 0 && NumParts < Mask.size() && - "Expected positive number of registers."); - unsigned SliceSize = Mask.size() / NumParts; - const auto *It = - find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); - unsigned Part = std::distance(Mask.begin(), It) / SliceSize; - estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({E1, E2}); } - void add(const TreeEntry &E1, ArrayRef Mask) { - if (InVectors.empty()) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, &E1); - return; - } - assert(!CommonMask.empty() && "Expected non-empty common mask."); - auto *MaskVecTy = - FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); - unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); - assert(NumParts > 0 && NumParts < Mask.size() && - "Expected positive number of registers."); - unsigned SliceSize = Mask.size() / NumParts; - const auto *It = - find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); - unsigned Part = std::distance(Mask.begin(), It) / SliceSize; - estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); - if (!SameNodesEstimated && InVectors.size() == 1) - InVectors.emplace_back(&E1); + void add(const TreeEntry *E1, ArrayRef Mask) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, E1); } /// Adds another one input vector and the mask for the shuffling. 
void add(Value *V1, ArrayRef Mask) { - if (InVectors.empty()) { - assert(CommonMask.empty() && "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); - return; - } - assert(InVectors.size() == 1 && InVectors.front().is() && - !CommonMask.empty() && "Expected only single entry from extracts."); - InVectors.push_back(V1); - unsigned VF = CommonMask.size(); - for (unsigned Idx = 0; Idx < VF; ++Idx) - if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) - CommonMask[Idx] = Mask[Idx] + VF; + assert(CommonMask.empty() && InVectors.empty() && + "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); } Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); @@ -7684,16 +7579,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ArrayRef VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (E->State != TreeEntry::NeedToGather) { - if (auto *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); - } - if (!FixedVectorType::isValidElementType(ScalarTy)) - return InstructionCost::getInvalid(); + if (auto *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -7705,7 +7596,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecTy = FixedVectorType::get(ScalarTy, VL.size()); } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); + auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7738,28 +7629,20 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - SmallVector> GatherShuffles; - SmallVector> Entries; + std::optional GatherShuffle; + SmallVector Entries; // Check for gathered extracts. 
- ExtractShuffle = - tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); + ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; - unsigned NumParts = TTI->getNumberOfParts(VecTy); - if (NumParts == 0 || NumParts >= GatheredScalars.size()) - NumParts = 1; if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } - } else if (ExtractShuffle && - TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { - copy(VL, GatheredScalars.begin()); - } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7769,12 +7652,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffles = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); - if (!GatherShuffles.empty()) { - if (GatherShuffles.size() == 1 && - *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && - Entries.front().front()->isSame(E->Scalars)) { + GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + if (GatherShuffle) { + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + if (*GatherShuffle == TTI::SK_PermuteSingleSrc && + Entries.front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. 
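// (Entries.front() holds exactly the same scalars here, so its vector value can be reused as-is.)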
LLVM_DEBUG( @@ -7788,18 +7671,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, continue; } if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front().front()->findLaneForValue(V); + Mask[I] = Entries.front()->findLaneForValue(V); } - Estimator.add(*Entries.front().front(), Mask); + Estimator.add(Entries.front(), Mask); return Estimator.finalize(E->ReuseShuffleIndices); } if (!Resized) { - if (GatheredScalars.size() != VF && - any_of(Entries, [&](ArrayRef TEs) { - return any_of(TEs, [&](const TreeEntry *TE) { - return TE->getVectorFactor() == VF; - }); - })) + unsigned VF1 = Entries.front()->getVectorFactor(); + unsigned VF2 = Entries.back()->getVectorFactor(); + if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -7811,21 +7691,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle " << shortBundleName(VL) << ".\n"); - unsigned SliceSize = E->Scalars.size() / NumParts; - SmallVector VecMask(Mask.size(), PoisonMaskElem); - for (const auto [I, TEs] : enumerate(Entries)) { - if (TEs.empty()) { - assert(!GatherShuffles[I] && - "No shuffles with empty entries list expected."); - continue; - } - assert((TEs.size() == 1 || TEs.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); - VecMask.assign(VecMask.size(), PoisonMaskElem); - copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); - Estimator.add(*TEs.front(), *TEs.back(), VecMask); - } + Estimator.add(Entries.front(), Entries.back(), Mask); if (all_of(GatheredScalars, PoisonValue ::classof)) return Estimator.finalize(E->ReuseShuffleIndices); return Estimator.finalize( @@ -7839,19 +7705,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (!all_of(GatheredScalars, PoisonValue::classof)) { auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); bool SameGathers = VL.equals(Gathers); - if (!SameGathers) - return Estimator.finalize( - E->ReuseShuffleIndices, E->Scalars.size(), - [&](Value *&Vec, SmallVectorImpl &Mask) { - Vec = Estimator.gather( - GatheredScalars, Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); - }); - Value *BV = Estimator.gather(Gathers); + Value *BV = Estimator.gather( + Gathers, SameGathers ? nullptr + : Constant::getNullValue(FixedVectorType::get( + ScalarTy, GatheredScalars.size()))); SmallVector ReuseMask(Gathers.size(), PoisonMaskElem); std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } + if (ExtractShuffle) + Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; @@ -9174,10 +9037,16 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } std::optional -BoUpSLP::isGatherShuffledSingleRegisterEntry( - const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part) { +BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, + SmallVectorImpl &Mask, + SmallVectorImpl &Entries) { Entries.clear(); + // No need to check for the topmost gather node. 
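+ // (It also has no user entry, while the analysis below expects exactly one.)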
+ if (TE == VectorizableTree.front().get()) + return std::nullopt; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); @@ -9252,7 +9121,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() : &getLastInstructionInBundle(UseEI.UserTE); if (TEInsertPt == InsertPt) { - // If 2 gathers are operands of the same entry (regardless of whether + // If 2 gathers are operands of the same entry (regardless of wether // user is PHI or else), compare operands indices, use the earlier one // as the base. if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) @@ -9317,10 +9186,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } } - if (UsedTEs.empty()) { - Entries.clear(); + if (UsedTEs.empty()) return std::nullopt; - } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -9336,8 +9203,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( }); if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { Entries.push_back(*It); - std::iota(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), 0); + std::iota(Mask.begin(), Mask.end(), 0); // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa(VL[I])) @@ -9474,10 +9340,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && - !VL.equals(ArrayRef(TE->Scalars) - .slice(Part * VL.size(), - std::min(VL.size(), TE->Scalars.size())))) { + if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -9490,10 +9353,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - unsigned Idx = Part * VL.size() + Pair.second; - Mask[Idx] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Idx] == Pair.second; + Mask[Pair.second] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Pair.second] == Pair.second; } switch (Entries.size()) { case 1: @@ -9508,63 +9370,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( break; } Entries.clear(); - // Clear the corresponding mask elements. - std::fill(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } -SmallVector> -BoUpSLP::isGatherShuffledEntry( - const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, - SmallVectorImpl> &Entries, - unsigned NumParts) { - assert(NumParts > 0 && NumParts < VL.size() && - "Expected positive number of registers."); - Entries.clear(); - // No need to check for the topmost gather node. 
- if (TE == VectorizableTree.front().get()) - return {}; - Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); - assert(VL.size() % NumParts == 0 && - "Number of scalars must be divisible by NumParts."); - unsigned SliceSize = VL.size() / NumParts; - SmallVector> Res; - for (unsigned Part = 0; Part < NumParts; ++Part) { - ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); - SmallVectorImpl &SubEntries = Entries.emplace_back(); - std::optional SubRes = - isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); - if (!SubRes) - SubEntries.clear(); - Res.push_back(SubRes); - if (SubEntries.size() == 1 && - SubRes.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && - SubEntries.front()->getVectorFactor() == VL.size() && - (SubEntries.front()->isSame(TE->Scalars) || - SubEntries.front()->isSame(VL))) { - Entries.clear(); - Res.clear(); - std::iota(Mask.begin(), Mask.end(), 0); - // Clear undef scalars. - for (int I = 0, Sz = VL.size(); I < Sz; ++I) - if (isa(VL[I])) - Mask[I] = PoisonMaskElem; - Entries.emplace_back(1, SubEntries.front()); - Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); - return Res; - } - } - if (all_of(Res, - [](const std::optional &SK) { return !SK; })) { - Entries.clear(); - return {}; - } - return Res; -} - InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -10031,13 +9839,9 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, - ArrayRef> Deps) { + Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](ArrayRef TEs) { - return all_of( - TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); - })) + if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) return nullptr; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. @@ -10372,13 +10176,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - SmallVector> GatherShuffles; - SmallVector> Entries; + std::optional GatherShuffle; + SmallVector Entries; Type *ScalarTy = GatheredScalars.front()->getType(); - unsigned NumParts = TTI->getNumberOfParts( - FixedVectorType::get(ScalarTy, GatheredScalars.size())); - if (NumParts == 0 || NumParts >= GatheredScalars.size()) - NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. ExtractShuffle = @@ -10397,10 +10197,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffles = - isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); } - if (!GatherShuffles.empty()) { + if (GatherShuffle) { if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. PostponedGathers.insert(E); @@ -10408,9 +10207,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // process to keep correct order. 
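// (Entries recorded in PostponedGathers are emitted after the rest of the tree.)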
return Delayed; } - if (GatherShuffles.size() == 1 && - *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && - Entries.front().front()->isSame(E->Scalars)) { + assert((Entries.size() == 1 || Entries.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + if (*GatherShuffle == TTI::SK_PermuteSingleSrc && + Entries.front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -10418,11 +10218,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { << "SLP: perfect diamond match for gather bundle " << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. - const TreeEntry *FrontTE = Entries.front().front(); - if (FrontTE->ReorderIndices.empty() && - ((FrontTE->ReuseShuffleIndices.empty() && - E->Scalars.size() == FrontTE->Scalars.size()) || - (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { + if (Entries.front()->ReorderIndices.empty() && + ((Entries.front()->ReuseShuffleIndices.empty() && + E->Scalars.size() == Entries.front()->Scalars.size()) || + (E->Scalars.size() == + Entries.front()->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -10430,20 +10230,17 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = FrontTE->findLaneForValue(V); + Mask[I] = Entries.front()->findLaneForValue(V); } } - ShuffleBuilder.add(FrontTE->VectorizedValue, Mask); + ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - if (GatheredScalars.size() != VF && - any_of(Entries, [&](ArrayRef TEs) { - return any_of(TEs, [&](const TreeEntry *TE) { - return TE->getVectorFactor() == VF; - }); - })) + unsigned VF1 = Entries.front()->getVectorFactor(); + unsigned VF2 = Entries.back()->getVectorFactor(); + if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -10543,9 +10340,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } }; - if (ExtractShuffle || !GatherShuffles.empty()) { + if (ExtractShuffle || GatherShuffle) { bool IsNonPoisoned = true; - bool IsUsedInExpr = true; + bool IsUsedInExpr = false; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -10570,53 +10367,36 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } if (Vec2) { - IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr &= FindReusedSplat( + IsUsedInExpr = FindReusedSplat( ExtractMask, cast(Vec1->getType())->getNumElements()); ShuffleBuilder.add(Vec1, ExtractMask); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { - IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); } } - if (!GatherShuffles.empty()) { - unsigned SliceSize = E->Scalars.size() / NumParts; - SmallVector VecMask(Mask.size(), PoisonMaskElem); - for (const auto [I, TEs] : enumerate(Entries)) { - if (TEs.empty()) { - assert(!GatherShuffles[I] && - "No shuffles with empty entries list expected."); - continue; - } - assert((TEs.size() == 1 || TEs.size() == 2) && - "Expected 
shuffle of 1 or 2 entries."); - auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); - VecMask.assign(VecMask.size(), PoisonMaskElem); - copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); - if (TEs.size() == 1) { - IsUsedInExpr &= FindReusedSplat( - VecMask, - cast(TEs.front()->VectorizedValue->getType()) - ->getNumElements()); - ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue); - } else { - IsUsedInExpr = false; - ShuffleBuilder.add(TEs.front()->VectorizedValue, - TEs.back()->VectorizedValue, VecMask); - IsNonPoisoned &= - isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) && - isGuaranteedNotToBePoison(TEs.back()->VectorizedValue); - } + if (GatherShuffle) { + if (Entries.size() == 1) { + IsUsedInExpr = FindReusedSplat( + Mask, + cast(Entries.front()->VectorizedValue->getType()) + ->getNumElements()); + ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue); + } else { + ShuffleBuilder.add(Entries.front()->VectorizedValue, + Entries.back()->VectorizedValue, Mask); + IsNonPoisoned &= + isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) && + isGuaranteedNotToBePoison(Entries.back()->VectorizedValue); } } // Try to figure out best way to combine values: build a shuffle and insert @@ -10627,18 +10407,14 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { int MSz = Mask.size(); // Try to build constant vector and shuffle with it only if currently we // have a single permutation and more than 1 scalar constants. - bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty(); + bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle; bool IsIdentityShuffle = (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) == TTI::SK_PermuteSingleSrc && none_of(ExtractMask, [&](int I) { return I >= EMSz; }) && ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) || - (!GatherShuffles.empty() && - all_of(GatherShuffles, - [](const std::optional &SK) { - return SK.value_or(TTI::SK_PermuteTwoSrc) == - TTI::SK_PermuteSingleSrc; - }) && + (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) == + TTI::SK_PermuteSingleSrc && none_of(Mask, [&](int I) { return I >= MSz; }) && ShuffleVectorInst::isIdentityMask(Mask, MSz)); bool EnoughConstsForShuffle = @@ -10814,13 +10590,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) { continue; } - // if (any_of(E->getOperand(i), [&](Value *V) { - // auto *I = dyn_cast(V); - // return I && I->getParent() == IBB; - // })) - Builder.SetInsertPoint(IBB->getTerminator()); - // else - // Builder.SetInsertPoint(IBB->getFirstNonPHIOrDbgOrLifetime()); + Builder.SetInsertPoint(IBB->getTerminator()); Builder.SetCurrentDebugLocation(PH->getDebugLoc()); Value *Vec = vectorizeOperand(E, i, /*PostponedPHIs=*/true); NewPhi->addIncoming(Vec, IBB); @@ -11484,22 +11254,10 @@ Value *BoUpSLP::vectorizeTree( // The is because source vector that supposed to feed this gather node was // inserted at the end of the block [after stab instruction]. So we need // to adjust insertion point again to the end of block. - if (isa(UserI)) { - // Insert before all users. 
- Instruction *InsertPt = PrevVec->getParent()->getTerminator(); - for (User *U : PrevVec->users()) { - if (U == UserI) - continue; - auto *UI = dyn_cast(U); - if (!UI || isa(UI) || UI->getParent() != InsertPt->getParent()) - continue; - if (UI->comesBefore(InsertPt)) - InsertPt = UI; - } - Builder.SetInsertPoint(InsertPt); - } else { + if (isa(UserI)) + Builder.SetInsertPoint(PrevVec->getParent()->getTerminator()); + else Builder.SetInsertPoint(PrevVec); - } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll index e5b5a5c6c4a00..21aac98aa3ece 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( @@ -14,43 +14,18 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; -; AVX2-LABEL: @test( -; AVX2-NEXT: entry: -; AVX2-NEXT: [[TMP0:%.*]] = 
insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 -; AVX2-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1 -; AVX2-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2 -; AVX2-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3 -; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]] -; AVX2-NEXT: [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]] -; AVX2-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] -; AVX2-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] -; AVX2-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; AVX2-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; AVX2-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; AVX2-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> -; AVX2-NEXT: br label [[BB:%.*]] -; AVX2: bb: -; AVX2-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; AVX2-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> -; AVX2-NEXT: br label [[BB]] -; entry: %a0 = add i64 %p0, %p0 %a1 = add i64 %p1, %p1 From 86d07154e6b74fdd93d7678c32f001a26491a481 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 26 Oct 2023 15:37:13 +0000 Subject: [PATCH 090/877] [gn build] Port a3490920615e --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 8766740a9ce83..50969e0f3f2ae 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -927,6 +927,7 @@ if (current_toolchain == default_toolchain) { "__utility/piecewise_construct.h", "__utility/priority_tag.h", "__utility/rel_ops.h", + "__utility/small_buffer.h", "__utility/swap.h", "__utility/to_underlying.h", "__utility/unreachable.h", From 196d154ab7a76e8ccb11addf61ff53387e397130 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 26 Jul 2023 07:47:08 -0700 Subject: [PATCH 091/877] [SLP]Improve isGatherShuffledEntry by trying per-register shuffle. Currently when building gather/buildvector node, we try to build nodes shuffles without taking into account separate vector registers. We can improve final codegen and the whole vectorization process by including this info into the analysis and the vector code emission, allows to emit better vectorized code. Differential Revision: https://reviews.llvm.org/D149742 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 463 +++++++++++++----- .../X86/multi-nodes-to-shuffle.ll | 45 +- 2 files changed, 388 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 4f82d2d1d6d91..bb4e743c1544a 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2507,17 +2507,31 @@ class BoUpSLP { /// instruction in the list). Instruction &getLastInstructionInBundle(const TreeEntry *E); - /// Checks if the gathered \p VL can be represented as shuffle(s) of previous - /// tree entries. 
+ /// Checks if the gathered \p VL can be represented as a single register + /// shuffle(s) of previous tree entries. /// \param TE Tree entry checked for permutation. /// \param VL List of scalars (a subset of the TE scalar), checked for - /// permutations. + /// permutations. Must form single-register vector. /// \returns ShuffleKind, if gathered values can be represented as shuffles of - /// previous tree entries. \p Mask is filled with the shuffle mask. + /// previous tree entries. \p Part of \p Mask is filled with the shuffle mask. std::optional - isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries); + isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, + SmallVectorImpl &Entries, unsigned Part); + + /// Checks if the gathered \p VL can be represented as multi-register + /// shuffle(s) of previous tree entries. + /// \param TE Tree entry checked for permutation. + /// \param VL List of scalars (a subset of the TE scalar), checked for + /// permutations. + /// \returns per-register series of ShuffleKind, if gathered values can be + /// represented as shuffles of previous tree entries. \p Mask is filled with + /// the shuffle mask (also on per-register base). + SmallVector> + isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts); /// \returns the scalarization cost for this list of values. Assuming that /// this subtree gets vectorized, we may need to extract the values from the @@ -6990,6 +7004,11 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { BoUpSLP &R; SmallPtrSetImpl &CheckedExtracts; constexpr static TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + /// While set, still trying to estimate the cost for the same nodes and we + /// can delay actual cost estimation (virtual shuffle instruction emission). + /// May help better estimate the cost if same nodes must be permuted + allows + /// to move most of the long shuffles cost estimation to TTI. + bool SameNodesEstimated = true; static Constant *getAllOnesValue(const DataLayout &DL, Type *Ty) { if (Ty->getScalarType()->isPointerTy()) { @@ -7230,6 +7249,49 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } return Cost; } + /// Transforms mask \p CommonMask per given \p Mask to make proper set after + /// shuffle emission. + static void transformMaskAfterShuffle(MutableArrayRef CommonMask, + ArrayRef Mask) { + for (unsigned Idx = 0, Sz = CommonMask.size(); Idx < Sz; ++Idx) + if (Mask[Idx] != PoisonMaskElem) + CommonMask[Idx] = Idx; + } + /// Adds the cost of reshuffling \p E1 and \p E2 (if present), using given + /// mask \p Mask, register number \p Part, that includes \p SliceSize + /// elements. + void estimateNodesPermuteCost(const TreeEntry &E1, const TreeEntry *E2, + ArrayRef Mask, unsigned Part, + unsigned SliceSize) { + if (SameNodesEstimated) { + // Delay the cost estimation if the same nodes are reshuffling. + // If we already requested the cost of reshuffling of E1 and E2 before, no + // need to estimate another cost with the sub-Mask, instead include this + // sub-Mask into the CommonMask to estimate it later and avoid double cost + // estimation. 
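+ // (Merging sub-masks this way costs one combined shuffle instead of one shuffle per register.)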
+ if ((InVectors.size() == 2 && + InVectors.front().get() == &E1 && + InVectors.back().get() == E2) || + (!E2 && InVectors.front().get() == &E1)) { + assert(all_of(ArrayRef(CommonMask).slice(Part * SliceSize, SliceSize), + [](int Idx) { return Idx == PoisonMaskElem; }) && + "Expected all poisoned elements."); + ArrayRef SubMask = + ArrayRef(Mask).slice(Part * SliceSize, SliceSize); + copy(SubMask, std::next(CommonMask.begin(), SliceSize * Part)); + return; + } + // Found non-matching nodes - need to estimate the cost for the matched + // and transform mask. + Cost += createShuffle(InVectors.front(), + InVectors.size() == 1 ? nullptr : InVectors.back(), + CommonMask); + transformMaskAfterShuffle(CommonMask, CommonMask); + } + SameNodesEstimated = false; + Cost += createShuffle(&E1, E2, Mask); + transformMaskAfterShuffle(CommonMask, Mask); + } class ShuffleCostBuilder { const TargetTransformInfo &TTI; @@ -7493,31 +7555,74 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { // into a vector and can be represented as a permutation elements in a // single input vector or of 2 input vectors. Cost += computeExtractCost(VL, Mask, ShuffleKind); + InVectors.assign(1, E); + CommonMask.assign(Mask.begin(), Mask.end()); + transformMaskAfterShuffle(CommonMask, CommonMask); + SameNodesEstimated = false; return VecBase; } - void add(const TreeEntry *E1, const TreeEntry *E2, ArrayRef Mask) { - if (E1 == E2) { + void add(const TreeEntry &E1, const TreeEntry &E2, ArrayRef Mask) { + if (&E1 == &E2) { assert(all_of(Mask, - [=](int Idx) { - return Idx < static_cast(E1->getVectorFactor()); + [&](int Idx) { + return Idx < static_cast(E1.getVectorFactor()); }) && "Expected single vector shuffle mask."); add(E1, Mask); return; } - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign({E1, E2}); + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign({&E1, &E2}); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, &E2, Mask, Part, SliceSize); } - void add(const TreeEntry *E1, ArrayRef Mask) { - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, E1); + void add(const TreeEntry &E1, ArrayRef Mask) { + if (InVectors.empty()) { + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, &E1); + return; + } + assert(!CommonMask.empty() && "Expected non-empty common mask."); + auto *MaskVecTy = + FixedVectorType::get(E1.Scalars.front()->getType(), Mask.size()); + unsigned NumParts = TTI.getNumberOfParts(MaskVecTy); + assert(NumParts > 0 && NumParts < Mask.size() && + "Expected positive number of registers."); + unsigned SliceSize = Mask.size() / NumParts; + const auto *It = + find_if(Mask, [](int Idx) { return Idx != PoisonMaskElem; }); + unsigned Part = std::distance(Mask.begin(), It) / SliceSize; + estimateNodesPermuteCost(E1, nullptr, Mask, Part, SliceSize); + if (!SameNodesEstimated && InVectors.size() == 1) + InVectors.emplace_back(&E1); } /// Adds another one input vector and the mask for the shuffling. 
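/// (This overload takes an already materialized vector value instead of a tree entry.)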
void add(Value *V1, ArrayRef Mask) { - assert(CommonMask.empty() && InVectors.empty() && - "Expected empty input mask/vectors."); - CommonMask.assign(Mask.begin(), Mask.end()); - InVectors.assign(1, V1); + if (InVectors.empty()) { + assert(CommonMask.empty() && "Expected empty input mask/vectors."); + CommonMask.assign(Mask.begin(), Mask.end()); + InVectors.assign(1, V1); + return; + } + assert(InVectors.size() == 1 && InVectors.front().is() && + !CommonMask.empty() && "Expected only single entry from extracts."); + InVectors.push_back(V1); + unsigned VF = CommonMask.size(); + for (unsigned Idx = 0; Idx < VF; ++Idx) + if (Mask[Idx] != PoisonMaskElem && CommonMask[Idx] == PoisonMaskElem) + CommonMask[Idx] = Mask[Idx] + VF; } Value *gather(ArrayRef VL, Value *Root = nullptr) { Cost += getBuildVectorCost(VL, Root); @@ -7579,12 +7684,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, ArrayRef VL = E->Scalars; Type *ScalarTy = VL[0]->getType(); - if (auto *SI = dyn_cast(VL[0])) - ScalarTy = SI->getValueOperand()->getType(); - else if (auto *CI = dyn_cast(VL[0])) - ScalarTy = CI->getOperand(0)->getType(); - else if (auto *IE = dyn_cast(VL[0])) - ScalarTy = IE->getOperand(1)->getType(); + if (E->State != TreeEntry::NeedToGather) { + if (auto *SI = dyn_cast(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + else if (auto *CI = dyn_cast(VL[0])) + ScalarTy = CI->getOperand(0)->getType(); + else if (auto *IE = dyn_cast(VL[0])) + ScalarTy = IE->getOperand(1)->getType(); + } + if (!FixedVectorType::isValidElementType(ScalarTy)) + return InstructionCost::getInvalid(); auto *VecTy = FixedVectorType::get(ScalarTy, VL.size()); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; @@ -7596,7 +7705,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecTy = FixedVectorType::get(ScalarTy, VL.size()); } unsigned EntryVF = E->getVectorFactor(); - auto *FinalVecTy = FixedVectorType::get(VecTy->getElementType(), EntryVF); + auto *FinalVecTy = FixedVectorType::get(ScalarTy, EntryVF); bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty(); if (E->State == TreeEntry::NeedToGather) { @@ -7629,20 +7738,28 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; // Check for gathered extracts. 
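// (Scalars that are extractelement instructions may be shuffled directly from their source vectors.)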
- ExtractShuffle = tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); + ExtractShuffle = + tryToGatherSingleRegisterExtractElements(GatheredScalars, ExtractMask); bool Resized = false; + unsigned NumParts = TTI->getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (Value *VecBase = Estimator.adjustExtracts( - E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) + E, ExtractMask, ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc))) { if (auto *VecBaseTy = dyn_cast(VecBase->getType())) if (VF == VecBaseTy->getNumElements() && GatheredScalars.size() != VF) { Resized = true; GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } + } else if (ExtractShuffle && + TTI->getNumberOfParts(VecTy) == VecTy->getNumElements()) { + copy(VL, GatheredScalars.begin()); + } // Do not try to look for reshuffled loads for gathered loads (they will be // handled later), for vectorized scalars, and cases, which are definitely @@ -7652,12 +7769,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); - if (GatherShuffle) { - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); + if (!GatherShuffles.empty()) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. 
LLVM_DEBUG( @@ -7671,15 +7788,18 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, continue; } if (Mask[I] == PoisonMaskElem) - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = Entries.front().front()->findLaneForValue(V); } - Estimator.add(Entries.front(), Mask); + Estimator.add(*Entries.front().front(), Mask); return Estimator.finalize(E->ReuseShuffleIndices); } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -7691,7 +7811,21 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, LLVM_DEBUG(dbgs() << "SLP: shuffled " << Entries.size() << " entries for bundle " << shortBundleName(VL) << ".\n"); - Estimator.add(Entries.front(), Entries.back(), Mask); + unsigned SliceSize = E->Scalars.size() / NumParts; + SmallVector VecMask(Mask.size(), PoisonMaskElem); + for (const auto [I, TEs] : enumerate(Entries)) { + if (TEs.empty()) { + assert(!GatherShuffles[I] && + "No shuffles with empty entries list expected."); + continue; + } + assert((TEs.size() == 1 || TEs.size() == 2) && + "Expected shuffle of 1 or 2 entries."); + auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize); + VecMask.assign(VecMask.size(), PoisonMaskElem); + copy(SubMask, std::next(VecMask.begin(), I * SliceSize)); + Estimator.add(*TEs.front(), *TEs.back(), VecMask); + } if (all_of(GatheredScalars, PoisonValue ::classof)) return Estimator.finalize(E->ReuseShuffleIndices); return Estimator.finalize( @@ -7705,16 +7839,19 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, if (!all_of(GatheredScalars, PoisonValue::classof)) { auto Gathers = ArrayRef(GatheredScalars).take_front(VL.size()); bool SameGathers = VL.equals(Gathers); - Value *BV = Estimator.gather( - Gathers, SameGathers ? nullptr - : Constant::getNullValue(FixedVectorType::get( - ScalarTy, GatheredScalars.size()))); + if (!SameGathers) + return Estimator.finalize( + E->ReuseShuffleIndices, E->Scalars.size(), + [&](Value *&Vec, SmallVectorImpl &Mask) { + Vec = Estimator.gather( + GatheredScalars, Constant::getNullValue(FixedVectorType::get( + ScalarTy, GatheredScalars.size()))); + }); + Value *BV = Estimator.gather(Gathers); SmallVector ReuseMask(Gathers.size(), PoisonMaskElem); std::iota(ReuseMask.begin(), ReuseMask.end(), 0); Estimator.add(BV, ReuseMask); } - if (ExtractShuffle) - Estimator.add(E, std::nullopt); return Estimator.finalize(E->ReuseShuffleIndices); } InstructionCost CommonCost = 0; @@ -9037,16 +9174,10 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } std::optional -BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, - SmallVectorImpl &Mask, - SmallVectorImpl &Entries) { +BoUpSLP::isGatherShuffledSingleRegisterEntry( + const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, + SmallVectorImpl &Entries, unsigned Part) { Entries.clear(); - // No need to check for the topmost gather node. 
- if (TE == VectorizableTree.front().get()) - return std::nullopt; - Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && - "Expected only single user of the gather node."); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); @@ -9121,7 +9252,7 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, UserPHI ? UserPHI->getIncomingBlock(UseEI.EdgeIdx)->getTerminator() : &getLastInstructionInBundle(UseEI.UserTE); if (TEInsertPt == InsertPt) { - // If 2 gathers are operands of the same entry (regardless of wether + // If 2 gathers are operands of the same entry (regardless of whether // user is PHI or else), compare operands indices, use the earlier one // as the base. if (TEUseEI.UserTE == UseEI.UserTE && TEUseEI.EdgeIdx < UseEI.EdgeIdx) @@ -9186,8 +9317,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, } } - if (UsedTEs.empty()) + if (UsedTEs.empty()) { + Entries.clear(); return std::nullopt; + } unsigned VF = 0; if (UsedTEs.size() == 1) { @@ -9203,7 +9336,8 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, }); if (It != FirstEntries.end() && (*It)->getVectorFactor() == VL.size()) { Entries.push_back(*It); - std::iota(Mask.begin(), Mask.end(), 0); + std::iota(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), 0); // Clear undef scalars. for (int I = 0, Sz = VL.size(); I < Sz; ++I) if (isa(VL[I])) @@ -9340,7 +9474,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, TempEntries.push_back(Entries[I]); } Entries.swap(TempEntries); - if (EntryLanes.size() == Entries.size() && !VL.equals(TE->Scalars)) { + if (EntryLanes.size() == Entries.size() && + !VL.equals(ArrayRef(TE->Scalars) + .slice(Part * VL.size(), + std::min(VL.size(), TE->Scalars.size())))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -9353,9 +9490,10 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - Mask[Pair.second] = Pair.first * VF + - Entries[Pair.first]->findLaneForValue(VL[Pair.second]); - IsIdentity &= Mask[Pair.second] == Pair.second; + unsigned Idx = Part * VL.size() + Pair.second; + Mask[Idx] = Pair.first * VF + + Entries[Pair.first]->findLaneForValue(VL[Pair.second]); + IsIdentity &= Mask[Idx] == Pair.second; } switch (Entries.size()) { case 1: @@ -9370,9 +9508,64 @@ BoUpSLP::isGatherShuffledEntry(const TreeEntry *TE, ArrayRef VL, break; } Entries.clear(); + // Clear the corresponding mask elements. + std::fill(std::next(Mask.begin(), Part * VL.size()), + std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); return std::nullopt; } +SmallVector> +BoUpSLP::isGatherShuffledEntry( + const TreeEntry *TE, ArrayRef VL, SmallVectorImpl &Mask, + SmallVectorImpl> &Entries, + unsigned NumParts) { + assert(NumParts > 0 && NumParts < VL.size() && + "Expected positive number of registers."); + Entries.clear(); + // No need to check for the topmost gather node. 
+ if (TE == VectorizableTree.front().get()) + return {}; + Mask.assign(VL.size(), PoisonMaskElem); + assert(TE->UserTreeIndices.size() == 1 && + "Expected only single user of the gather node."); + assert(VL.size() % NumParts == 0 && + "Number of scalars must be divisible by NumParts."); + unsigned SliceSize = VL.size() / NumParts; + SmallVector> Res; + for (unsigned Part = 0; Part < NumParts; ++Part) { + ArrayRef SubVL = VL.slice(Part * SliceSize, SliceSize); + SmallVectorImpl &SubEntries = Entries.emplace_back(); + std::optional SubRes = + isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part); + if (!SubRes) + SubEntries.clear(); + Res.push_back(SubRes); + if (SubEntries.size() == 1 && *SubRes == TTI::SK_PermuteSingleSrc && + SubEntries.front()->getVectorFactor() == VL.size() && + (SubEntries.front()->isSame(TE->Scalars) || + SubEntries.front()->isSame(VL))) { + SmallVector LocalSubEntries; + LocalSubEntries.swap(SubEntries); + Entries.clear(); + Res.clear(); + std::iota(Mask.begin(), Mask.end(), 0); + // Clear undef scalars. + for (int I = 0, Sz = VL.size(); I < Sz; ++I) + if (isa(VL[I])) + Mask[I] = PoisonMaskElem; + Entries.emplace_back(1, LocalSubEntries.front()); + Res.push_back(TargetTransformInfo::SK_PermuteSingleSrc); + return Res; + } + } + if (all_of(Res, + [](const std::optional &SK) { return !SK; })) { + Entries.clear(); + return {}; + } + return Res; +} + InstructionCost BoUpSLP::getGatherCost(ArrayRef VL, bool ForPoisonSrc) const { // Find the type of the operands in VL. @@ -9839,9 +10032,13 @@ class BoUpSLP::ShuffleInstructionBuilder final : public BaseShuffleAnalysis { } /// Checks if the specified entry \p E needs to be delayed because of its /// dependency nodes. - Value *needToDelay(const TreeEntry *E, ArrayRef Deps) { + Value *needToDelay(const TreeEntry *E, + ArrayRef> Deps) { // No need to delay emission if all deps are ready. - if (all_of(Deps, [](const TreeEntry *TE) { return TE->VectorizedValue; })) + if (all_of(Deps, [](ArrayRef TEs) { + return all_of( + TEs, [](const TreeEntry *TE) { return TE->VectorizedValue; }); + })) return nullptr; // Postpone gather emission, will be emitted after the end of the // process to keep correct order. @@ -10176,9 +10373,13 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { SmallVector Mask; SmallVector ExtractMask; std::optional ExtractShuffle; - std::optional GatherShuffle; - SmallVector Entries; + SmallVector> GatherShuffles; + SmallVector> Entries; Type *ScalarTy = GatheredScalars.front()->getType(); + unsigned NumParts = TTI->getNumberOfParts( + FixedVectorType::get(ScalarTy, GatheredScalars.size())); + if (NumParts == 0 || NumParts >= GatheredScalars.size()) + NumParts = 1; if (!all_of(GatheredScalars, UndefValue::classof)) { // Check for gathered extracts. ExtractShuffle = @@ -10197,9 +10398,10 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || (E->Scalars != GatheredScalars && GatheredScalars.size() <= 2)) { - GatherShuffle = isGatherShuffledEntry(E, GatheredScalars, Mask, Entries); + GatherShuffles = + isGatherShuffledEntry(E, GatheredScalars, Mask, Entries, NumParts); } - if (GatherShuffle) { + if (!GatherShuffles.empty()) { if (Value *Delayed = ShuffleBuilder.needToDelay(E, Entries)) { // Delay emission of gathers which are not ready yet. 
PostponedGathers.insert(E); @@ -10207,10 +10409,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { // process to keep correct order. return Delayed; } - assert((Entries.size() == 1 || Entries.size() == 2) && - "Expected shuffle of 1 or 2 entries."); - if (*GatherShuffle == TTI::SK_PermuteSingleSrc && - Entries.front()->isSame(E->Scalars)) { + if (GatherShuffles.size() == 1 && + *GatherShuffles.front() == TTI::SK_PermuteSingleSrc && + Entries.front().front()->isSame(E->Scalars)) { // Perfect match in the graph, will reuse the previously vectorized // node. Cost is 0. LLVM_DEBUG( @@ -10218,11 +10419,11 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { << "SLP: perfect diamond match for gather bundle " << shortBundleName(E->Scalars) << ".\n"); // Restore the mask for previous partially matched values. - if (Entries.front()->ReorderIndices.empty() && - ((Entries.front()->ReuseShuffleIndices.empty() && - E->Scalars.size() == Entries.front()->Scalars.size()) || - (E->Scalars.size() == - Entries.front()->ReuseShuffleIndices.size()))) { + const TreeEntry *FrontTE = Entries.front().front(); + if (FrontTE->ReorderIndices.empty() && + ((FrontTE->ReuseShuffleIndices.empty() && + E->Scalars.size() == FrontTE->Scalars.size()) || + (E->Scalars.size() == FrontTE->ReuseShuffleIndices.size()))) { std::iota(Mask.begin(), Mask.end(), 0); } else { for (auto [I, V] : enumerate(E->Scalars)) { @@ -10230,17 +10431,20 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { Mask[I] = PoisonMaskElem; continue; } - Mask[I] = Entries.front()->findLaneForValue(V); + Mask[I] = FrontTE->findLaneForValue(V); } } - ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask); + ShuffleBuilder.add(FrontTE->VectorizedValue, Mask); Res = ShuffleBuilder.finalize(E->getCommonMask()); return Res; } if (!Resized) { - unsigned VF1 = Entries.front()->getVectorFactor(); - unsigned VF2 = Entries.back()->getVectorFactor(); - if ((VF == VF1 || VF == VF2) && GatheredScalars.size() != VF) + if (GatheredScalars.size() != VF && + any_of(Entries, [&](ArrayRef TEs) { + return any_of(TEs, [&](const TreeEntry *TE) { + return TE->getVectorFactor() == VF; + }); + })) GatheredScalars.append(VF - GatheredScalars.size(), PoisonValue::get(ScalarTy)); } @@ -10340,9 +10544,9 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } }; - if (ExtractShuffle || GatherShuffle) { + if (ExtractShuffle || !GatherShuffles.empty()) { bool IsNonPoisoned = true; - bool IsUsedInExpr = false; + bool IsUsedInExpr = true; Value *Vec1 = nullptr; if (ExtractShuffle) { // Gather of extractelements can be represented as just a shuffle of @@ -10367,36 +10571,53 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) { } } if (Vec2) { + IsUsedInExpr = false; IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1) && isGuaranteedNotToBePoison(Vec2); ShuffleBuilder.add(Vec1, Vec2, ExtractMask); } else if (Vec1) { - IsUsedInExpr = FindReusedSplat( + IsUsedInExpr &= FindReusedSplat( ExtractMask, cast(Vec1->getType())->getNumElements()); ShuffleBuilder.add(Vec1, ExtractMask); IsNonPoisoned &= isGuaranteedNotToBePoison(Vec1); } else { + IsUsedInExpr = false; ShuffleBuilder.add(PoisonValue::get(FixedVectorType::get( ScalarTy, GatheredScalars.size())), ExtractMask); } } - if (GatherShuffle) { - if (Entries.size() == 1) { - IsUsedInExpr = FindReusedSplat( - Mask, - cast(Entries.front()->VectorizedValue->getType()) - ->getNumElements()); - 
ShuffleBuilder.add(Entries.front()->VectorizedValue, Mask);
-      IsNonPoisoned &=
-          isGuaranteedNotToBePoison(Entries.front()->VectorizedValue);
-    } else {
-      ShuffleBuilder.add(Entries.front()->VectorizedValue,
-                         Entries.back()->VectorizedValue, Mask);
-      IsNonPoisoned &=
-          isGuaranteedNotToBePoison(Entries.front()->VectorizedValue) &&
-          isGuaranteedNotToBePoison(Entries.back()->VectorizedValue);
+    if (!GatherShuffles.empty()) {
+      unsigned SliceSize = E->Scalars.size() / NumParts;
+      SmallVector<int> VecMask(Mask.size(), PoisonMaskElem);
+      for (const auto [I, TEs] : enumerate(Entries)) {
+        if (TEs.empty()) {
+          assert(!GatherShuffles[I] &&
+                 "No shuffles with empty entries list expected.");
+          continue;
+        }
+        assert((TEs.size() == 1 || TEs.size() == 2) &&
+               "Expected shuffle of 1 or 2 entries.");
+        auto SubMask = ArrayRef(Mask).slice(I * SliceSize, SliceSize);
+        VecMask.assign(VecMask.size(), PoisonMaskElem);
+        copy(SubMask, std::next(VecMask.begin(), I * SliceSize));
+        if (TEs.size() == 1) {
+          IsUsedInExpr &= FindReusedSplat(
+              VecMask,
+              cast<FixedVectorType>(TEs.front()->VectorizedValue->getType())
+                  ->getNumElements());
+          ShuffleBuilder.add(TEs.front()->VectorizedValue, VecMask);
+          IsNonPoisoned &=
+              isGuaranteedNotToBePoison(TEs.front()->VectorizedValue);
+        } else {
+          IsUsedInExpr = false;
+          ShuffleBuilder.add(TEs.front()->VectorizedValue,
+                             TEs.back()->VectorizedValue, VecMask);
+          IsNonPoisoned &=
+              isGuaranteedNotToBePoison(TEs.front()->VectorizedValue) &&
+              isGuaranteedNotToBePoison(TEs.back()->VectorizedValue);
+        }
       }
     }
     // Try to figure out best way to combine values: build a shuffle and insert
@@ -10407,14 +10628,18 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Args &...Params) {
     int MSz = Mask.size();
     // Try to build constant vector and shuffle with it only if currently we
     // have a single permutation and more than 1 scalar constants.
-    bool IsSingleShuffle = !ExtractShuffle || !GatherShuffle;
+    bool IsSingleShuffle = !ExtractShuffle || GatherShuffles.empty();
     bool IsIdentityShuffle =
         (ExtractShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
             TTI::SK_PermuteSingleSrc &&
         none_of(ExtractMask, [&](int I) { return I >= EMSz; }) &&
         ShuffleVectorInst::isIdentityMask(ExtractMask, EMSz)) ||
-        (GatherShuffle.value_or(TTI::SK_PermuteTwoSrc) ==
-            TTI::SK_PermuteSingleSrc &&
+        (!GatherShuffles.empty() &&
+         all_of(GatherShuffles,
+                [](const std::optional<TTI::ShuffleKind> &SK) {
+                  return SK.value_or(TTI::SK_PermuteTwoSrc) ==
+                         TTI::SK_PermuteSingleSrc;
+                }) &&
         none_of(Mask, [&](int I) { return I >= MSz; }) &&
         ShuffleVectorInst::isIdentityMask(Mask, MSz));
     bool EnoughConstsForShuffle =
@@ -10590,7 +10815,13 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
           continue;
         }
-        Builder.SetInsertPoint(IBB->getTerminator());
+        // if (any_of(E->getOperand(i), [&](Value *V) {
+        //       auto *I = dyn_cast<Instruction>(V);
+        //       return I && I->getParent() == IBB;
+        //     }))
+        Builder.SetInsertPoint(IBB->getTerminator());
+        // else
+        //   Builder.SetInsertPoint(IBB->getFirstNonPHIOrDbgOrLifetime());
         Builder.SetCurrentDebugLocation(PH->getDebugLoc());
         Value *Vec = vectorizeOperand(E, i, /*PostponedPHIs=*/true);
         NewPhi->addIncoming(Vec, IBB);
@@ -11254,10 +11485,22 @@ Value *BoUpSLP::vectorizeTree(
     // This is because the source vector that is supposed to feed this gather
     // node was inserted at the end of the block [after the stab instruction].
     // So we need to adjust the insertion point again to the end of the block.
-    if (isa(UserI))
-      Builder.SetInsertPoint(PrevVec->getParent()->getTerminator());
-    else
+    if (isa(UserI)) {
+      // Insert before all users.
+ Instruction *InsertPt = PrevVec->getParent()->getTerminator(); + for (User *U : PrevVec->users()) { + if (U == UserI) + continue; + auto *UI = dyn_cast(U); + if (!UI || isa(UI) || UI->getParent() != InsertPt->getParent()) + continue; + if (UI->comesBefore(InsertPt)) + InsertPt = UI; + } + Builder.SetInsertPoint(InsertPt); + } else { Builder.SetInsertPoint(PrevVec); + } Builder.SetCurrentDebugLocation(UserI->getDebugLoc()); Value *Vec = vectorizeTree(TE, /*PostponedPHIs=*/false); PrevVec->replaceAllUsesWith(Vec); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll index 21aac98aa3ece..e5b5a5c6c4a00 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-nodes-to-shuffle.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 | FileCheck %s -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-107 -mattr=+avx2 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 | FileCheck %s +; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-115 -mattr=+avx2 | FileCheck %s --check-prefix=AVX2 define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-LABEL: @test( @@ -14,18 +14,43 @@ define void @test(i64 %p0, i64 %p1, i64 %p2, i64 %p3) { ; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <4 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = or <4 x i64> [[TMP11]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i64> [[TMP15]] to <4 x i32> ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: -; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i32> [ [[TMP18:%.*]], [[BB]] ], [ [[TMP16]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP18]] = trunc <4 x i64> [[TMP8]] to <4 x i32> ; CHECK-NEXT: br label [[BB]] ; +; AVX2-LABEL: @test( +; AVX2-NEXT: entry: +; AVX2-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[P0:%.*]], i32 0 +; AVX2-NEXT: [[TMP1:%.*]] = 
insertelement <4 x i64> [[TMP0]], i64 [[P1:%.*]], i32 1
+; AVX2-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[P2:%.*]], i32 2
+; AVX2-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[P3:%.*]], i32 3
+; AVX2-NEXT:    [[TMP4:%.*]] = add <4 x i64> [[TMP3]], [[TMP3]]
+; AVX2-NEXT:    [[TMP5:%.*]] = mul <4 x i64> [[TMP3]], [[TMP3]]
+; AVX2-NEXT:    [[TMP6:%.*]] = sdiv <4 x i64> [[TMP3]], [[TMP3]]
+; AVX2-NEXT:    [[TMP7:%.*]] = sub <4 x i64> [[TMP5]], [[TMP6]]
+; AVX2-NEXT:    [[TMP8:%.*]] = shl <4 x i64> [[TMP4]], [[TMP7]]
+; AVX2-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> 
+; AVX2-NEXT:    [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP6]], <4 x i32> 
+; AVX2-NEXT:    [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> 
+; AVX2-NEXT:    [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP6]], <4 x i32> 
+; AVX2-NEXT:    [[TMP13:%.*]] = or <4 x i64> [[TMP10]], [[TMP12]]
+; AVX2-NEXT:    [[TMP14:%.*]] = trunc <4 x i64> [[TMP13]] to <4 x i32>
+; AVX2-NEXT:    br label [[BB:%.*]]
+; AVX2:       bb:
+; AVX2-NEXT:    [[TMP15:%.*]] = phi <4 x i32> [ [[TMP16:%.*]], [[BB]] ], [ [[TMP14]], [[ENTRY:%.*]] ]
+; AVX2-NEXT:    [[TMP16]] = trunc <4 x i64> [[TMP8]] to <4 x i32>
+; AVX2-NEXT:    br label [[BB]]
+;
 entry:
   %a0 = add i64 %p0, %p0
   %a1 = add i64 %p1, %p1

From a7d6039f3efb02992d64164e6778b8bebf0a526b Mon Sep 17 00:00:00 2001
From: Aviad Cohen
Date: Thu, 26 Oct 2023 18:54:23 +0300
Subject: [PATCH 092/877] [mlir][linalg] Replace CopyOp from memref to linalg
 in linalg PromoteOp (#69154)

linalg::CopyOp is much more generic and useful for promoting buffers. In
addition, promotion is a linalg transform, so it makes more sense to use
linalg operations where possible.
---
 .../Dialect/Linalg/Transforms/Promotion.cpp   |  2 +-
 mlir/test/Dialect/Linalg/promote.mlir         | 28 ++++++++-----------
 .../Dialect/Linalg/promotion_options.mlir     |  6 ++--
 .../Dialect/Linalg/transform-promotion.mlir   | 14 +++++-----
 4 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
index a131f30976661..5c140a7d692a9 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp
@@ -209,7 +209,7 @@ LinalgOpInstancePromotionOptions::LinalgOpInstancePromotionOptions(
   Location loc = linalgOp.getLoc();
   auto defaultCopyCallBack = [loc](OpBuilder &b, Value src,
                                    Value dst) -> LogicalResult {
-    b.create<memref::CopyOp>(loc, src, dst);
+    b.create<linalg::CopyOp>(loc, src, dst);
     return success();
   };
   copyInFn = (options.copyInFn ?
*(options.copyInFn) : defaultCopyCallBack); diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir index 4fa56b474f82e..fb5f357f3faa8 100644 --- a/mlir/test/Dialect/Linalg/promote.mlir +++ b/mlir/test/Dialect/Linalg/promote.mlir @@ -52,15 +52,13 @@ func.func @matmul_f32(%A: memref, %M: index, %N: index, %K: index) { // CHECK: %[[fullC:.*]] = memref.view %[[tmpC]][{{.*}}][{{.*}}] : memref<24xi8> to memref // CHECK: %[[partialC:.*]] = memref.subview %[[fullC]]{{.*}} : memref to memref> -// CHECK: memref.copy %[[vA]], %[[partialA]] : memref> to memref> -// CHECK: memref.copy %[[vB]], %[[partialB]] : memref> to memref> -// CHECK: memref.copy %[[vC]], %[[partialC]] : memref> to memref> +// CHECK: linalg.copy ins(%[[vA]] : memref>) outs(%[[partialA]] : memref>) +// CHECK: linalg.copy ins(%[[vB]] : memref>) outs(%[[partialB]] : memref>) +// CHECK: linalg.copy ins(%[[vC]] : memref>) outs(%[[partialC]] : memref>) // // CHECK: linalg.matmul ins(%[[partialA]], %[[partialB]]{{.*}} outs(%[[partialC]] // -// CHECK: memref.copy %[[partialC]], %[[vC]] : -// CHECK: memref> to -// CHECK: memref> +// CHECK: linalg.copy ins(%[[partialC]] : memref>) outs(%[[vC]] : memref>) // // CHECK-NOT: memref.dealloc %[[tmpA]] : memref<32xi8> // CHECK-NOT: memref.dealloc %[[tmpB]] : memref<48xi8> @@ -124,15 +122,13 @@ func.func @matmul_f64(%A: memref, %M: index, %N: index, %K: index) { // CHECK: %[[fullC_f64:.*]] = memref.view %[[tmpC_f64]][{{.*}}][{{.*}}] : memref<48xi8> to memref // CHECK: %[[partialC_f64:.*]] = memref.subview %[[fullC_f64]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref to memref> -// CHECK: memref.copy %[[vA_f64]], %[[partialA_f64]] : memref> to memref> -// CHECK: memref.copy %[[vB_f64]], %[[partialB_f64]] : memref> to memref> -// CHECK: memref.copy %[[vC_f64]], %[[partialC_f64]] : memref> to memref> +// CHECK: linalg.copy ins(%[[vA_f64]] : memref>) outs(%[[partialA_f64]] : memref>) +// CHECK: linalg.copy ins(%[[vB_f64]] : memref>) outs(%[[partialB_f64]] : memref>) +// CHECK: linalg.copy ins(%[[vC_f64]] : memref>) outs(%[[partialC_f64]] : memref>) // // CHECK: linalg.matmul ins(%[[partialA_f64]], %[[partialB_f64]]{{.*}} outs(%[[partialC_f64]] // -// CHECK: memref.copy %[[partialC_f64]], %[[vC_f64]] : -// CHECK: memref> to -// CHECK: memref> +// CHECK: linalg.copy ins(%[[partialC_f64]] : memref>) outs(%[[vC_f64]] : memref>) // // CHECK: memref.dealloc %[[tmpA_f64]] : memref<64xi8> // CHECK: memref.dealloc %[[tmpB_f64]] : memref<96xi8> @@ -263,7 +259,7 @@ func.func @promote_rank_reducing_subviews(%arg0: memref> // CHECK: %[[VAL_61:.*]] = memref.view %[[VAL_60]]{{\[}}%[[VAL_56]]]{{\[}}%[[VAL_50]], %[[VAL_53]]] : memref<48xi8, #gpu.address_space> to memref> // CHECK: %[[VAL_62:.*]] = memref.subview %[[VAL_61]][0, 0] {{\[}}%[[VAL_52]], %[[VAL_55]]] [1, 1] : memref> to memref, #gpu.address_space> - // CHECK: memref.copy %[[VAL_3]], %[[VAL_24]] : memref<4x3xf32, strided<[4, 1]>, 1> to memref, #gpu.address_space> - // CHECK: memref.copy %[[VAL_4]], %[[VAL_43]] : memref<4x3xf32, strided<[4, 1]>, 1> to memref, #gpu.address_space> +// CHECK: linalg.copy ins(%[[VAL_3]] : memref<4x3xf32, strided<[4, 1]>, 1>) outs(%[[VAL_24]] : memref, #gpu.address_space>) +// CHECK: linalg.copy ins(%[[VAL_4]] : memref<4x3xf32, strided<[4, 1]>, 1>) outs(%[[VAL_43]] : memref, #gpu.address_space>) // CHECK: linalg.generic {doc = "", indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"], library_call = ""} ins(%[[VAL_24]], %[[VAL_43]] : memref, #gpu.address_space>, memref, 
#gpu.address_space>) outs(%[[VAL_62]] : memref, #gpu.address_space>) { // CHECK: ^bb0(%[[VAL_63:.*]]: f32, %[[VAL_64:.*]]: f32, %[[VAL_65:.*]]: f32): // CHECK: %[[VAL_66:.*]] = arith.addf %[[VAL_63]], %[[VAL_64]] : f32 @@ -376,7 +372,7 @@ func.func @linalg_generic_update_all_function_inputs_outputs(%arg0: memref<3x4xf linalg.yield %1 : f32 } - // CHECK: memref.copy %[[VAL_62]], %[[VAL_5]] : memref, #gpu.address_space> to memref<4x3xf32, strided<[4, 1]>, 1> + // CHECK: linalg.copy ins(%[[VAL_62]] : memref, #gpu.address_space>) outs(%[[VAL_5]] : memref<4x3xf32, strided<[4, 1]>, 1>) // CHECK: memref.dealloc %[[VAL_22]] : memref<48xi8, #gpu.address_space> // CHECK: memref.dealloc %[[VAL_41]] : memref<48xi8, #gpu.address_space> // CHECK: memref.dealloc %[[VAL_60]] : memref<48xi8, #gpu.address_space> diff --git a/mlir/test/Dialect/Linalg/promotion_options.mlir b/mlir/test/Dialect/Linalg/promotion_options.mlir index 760336ff34f84..3bf74b708cb82 100644 --- a/mlir/test/Dialect/Linalg/promotion_options.mlir +++ b/mlir/test/Dialect/Linalg/promotion_options.mlir @@ -27,10 +27,10 @@ func.func @gemm(%a : memref, %b : memref, %c : memref // CHECK: %[[VC:.*]] = memref.view %[[tmpC]][%[[C0]]][] : memref<1024xi8> to memref<16x16xf32> // CHECK: %[[svCC:.+]] = memref.subview %[[VC]] -// CHECK: memref.copy %[[svA]], %[[svAA]] -// CHECK: memref.copy %[[svC]], %[[svCC]] +// CHECK: linalg.copy ins(%[[svA]] : memref>) outs(%[[svAA]] : memref>) +// CHECK: linalg.copy ins(%[[svC]] : memref>) outs(%[[svCC]] : memref>) // CHECK: linalg.matmul ins(%[[VA]], %[[svB]]{{.*}} outs(%[[VC]] -// CHECK: memref.copy %[[svCC]], %[[svC]] +// CHECK: linalg.copy ins(%[[svCC]] : memref>) outs(%[[svC]] : memref>) // CHECK: memref.dealloc %[[tmpA]] // CHECK: memref.dealloc %[[tmpC]] diff --git a/mlir/test/Dialect/Linalg/transform-promotion.mlir b/mlir/test/Dialect/Linalg/transform-promotion.mlir index 362719b730334..d6112db0f7772 100644 --- a/mlir/test/Dialect/Linalg/transform-promotion.mlir +++ b/mlir/test/Dialect/Linalg/transform-promotion.mlir @@ -51,9 +51,9 @@ func.func @promote_subview_matmul(%arg0: memref to memref // CHECK: %[[l2:.*]] = memref.subview %[[v2]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] // CHECK-SAME: memref to memref> -// CHECK: memref.copy %[[s0]], %[[l0]] : memref to memref -// CHECK: memref.copy %[[s1]], %[[l1]] : memref to memref -// CHECK: memref.copy %[[s2]], %[[l2]] : memref to memref +// CHECK: linalg.copy ins(%[[s0]] : memref) outs(%[[l0]] : memref) +// CHECK: linalg.copy ins(%[[s1]] : memref) outs(%[[l1]] : memref) +// CHECK: linalg.copy ins(%[[s2]] : memref) outs(%[[l2]] : memref) // CHECK: linalg.matmul // CHECK-SAME: ins(%[[v0]], %[[v1]] : memref, memref) // CHECK-SAME: outs(%[[v2]] : memref) @@ -114,8 +114,8 @@ func.func @promote_first_subview_matmul(%arg0: memref to memref -// CHECK-NOT: memref.copy +// CHECK: linalg.copy ins(%[[s0]] : memref) outs(%[[l0]] : memref) +// CHECK-NOT: linalg.copy // CHECK: linalg.matmul // CHECK-SAME: ins(%[[v0]], %[[s1]] : memref, memref>) // CHECK-SAME: outs(%[[s2]] : memref>) @@ -149,7 +149,7 @@ func.func @aligned_promote_fill(%arg0: memref to memref // CHECK: %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref to memref> // CHECK: linalg.fill ins({{.*}} : f32) outs(%[[v0]] : memref) -// CHECK: memref.copy %[[s0]], %[[l0]] : memref to memref +// CHECK: linalg.copy ins(%[[s0]] : memref) outs(%[[l0]] : memref) // CHECK: linalg.fill ins(%[[cf]] : f32) outs(%[[v0]] : memref) module attributes {transform.with_named_sequence} { @@ -182,7 +182,7 @@ func.func 
@aligned_promote_fill_complex(%arg0: memref, strided< // CHECK: %[[v0:.*]] = memref.view %[[a0]]{{.*}} : memref<64000000xi8> to memref> // CHECK: %[[l0:.*]] = memref.subview %[[v0]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : memref> to memref, strided<[?, 1], offset: ?>> // CHECK: linalg.fill ins({{.*}} : complex) outs(%[[v0]] : memref>) -// CHECK: memref.copy %[[s0]], %[[l0]] : memref, strided{{.*}}> to memref, strided{{.*}}> +// CHECK: linalg.copy ins(%[[s0]] : memref, strided{{.*}}>) outs(%[[l0]] : memref, strided{{.*}}>) // CHECK: linalg.fill ins(%[[cc]] : complex) outs(%[[v0]] : memref>) module attributes {transform.with_named_sequence} { From 09e8ef975d9970560b893f79ec283f69ea8db953 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 26 Oct 2023 11:59:17 -0400 Subject: [PATCH 093/877] Diagnose use of VLAs in a coroutine (#70341) Fixes https://github.com/llvm/llvm-project/issues/65858 --- clang/docs/ReleaseNotes.rst | 4 +++ .../clang/Basic/DiagnosticSemaKinds.td | 2 ++ clang/include/clang/Sema/ScopeInfo.h | 8 +++++ clang/lib/Sema/SemaCoroutine.cpp | 5 ++++ clang/lib/Sema/SemaType.cpp | 18 ++++++++---- clang/test/SemaCXX/coroutine-vla.cpp | 29 +++++++++++++++++++ 6 files changed, 60 insertions(+), 6 deletions(-) create mode 100644 clang/test/SemaCXX/coroutine-vla.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 074116d2edf9f..7238386231e1a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -636,6 +636,10 @@ Bug Fixes to C++ Support (`#46200 `_) (`#57812 `_) +- Diagnose use of a variable-length array in a coroutine. The design of + coroutines is such that it is not possible to support VLA use. Fixes: + (`#65858 `_) + - Fix bug where we were overriding zero-initialization of class members when default initializing a base class in a constant expression context. Fixes: (`#69890 `_) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index a673ce726d6c2..453bd8a9a3404 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -166,6 +166,8 @@ def ext_vla_folded_to_constant : ExtWarn< InGroup; def err_vla_unsupported : Error< "variable length arrays are not supported for %select{the current target|'%1'}0">; +def err_vla_in_coroutine_unsupported : Error< + "variable length arrays in a coroutine are not supported">; def note_vla_unsupported : Note< "variable length arrays are not supported for the current target">; diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index 02b22af89ff03..b2f6e3289f41f 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -189,6 +189,9 @@ class FunctionScopeInfo { /// First SEH '__try' statement in the current function. SourceLocation FirstSEHTryLoc; + /// First use of a VLA within the current function. + SourceLocation FirstVLALoc; + private: /// Used to determine if errors occurred in this function or block. 
DiagnosticErrorTrap ErrorTrap; @@ -473,6 +476,11 @@ class FunctionScopeInfo { FirstSEHTryLoc = TryLoc; } + void setHasVLA(SourceLocation VLALoc) { + if (FirstVLALoc.isInvalid()) + FirstVLALoc = VLALoc; + } + bool NeedsScopeChecking() const { return !HasDroppedStmt && (HasIndirectGoto || HasMustTail || (HasBranchProtectedScope && HasBranchIntoScope)); diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index d2b0922a4bb9c..cfaa93fbea4dd 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -1198,6 +1198,11 @@ void Sema::CheckCompletedCoroutineBody(FunctionDecl *FD, Stmt *&Body) { if (FD->hasAttr()) Diag(FD->getLocation(), diag::warn_always_inline_coroutine); + // The design of coroutines means we cannot allow use of VLAs within one, so + // diagnose if we've seen a VLA in the body of this function. + if (Fn->FirstVLALoc.isValid()) + Diag(Fn->FirstVLALoc, diag::err_vla_in_coroutine_unsupported); + // [stmt.return.coroutine]p1: // A coroutine shall not enclose a return statement ([stmt.return]). if (Fn->FirstReturnLoc.isValid()) { diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 28b81c1768a30..dea77fae4cadb 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -2706,12 +2706,18 @@ QualType Sema::BuildArrayType(QualType T, ArrayType::ArraySizeModifier ASM, } } - if (T->isVariableArrayType() && !Context.getTargetInfo().isVLASupported()) { - // CUDA device code and some other targets don't support VLAs. - bool IsCUDADevice = (getLangOpts().CUDA && getLangOpts().CUDAIsDevice); - targetDiag(Loc, - IsCUDADevice ? diag::err_cuda_vla : diag::err_vla_unsupported) - << (IsCUDADevice ? CurrentCUDATarget() : 0); + if (T->isVariableArrayType()) { + if (!Context.getTargetInfo().isVLASupported()) { + // CUDA device code and some other targets don't support VLAs. + bool IsCUDADevice = (getLangOpts().CUDA && getLangOpts().CUDAIsDevice); + targetDiag(Loc, + IsCUDADevice ? diag::err_cuda_vla : diag::err_vla_unsupported) + << (IsCUDADevice ? CurrentCUDATarget() : 0); + } else if (sema::FunctionScopeInfo *FSI = getCurFunction()) { + // VLAs are supported on this target, but we may need to do delayed + // checking that the VLA is not being used within a coroutine. + FSI->setHasVLA(Loc); + } } // If this is not C99, diagnose array size modifiers on non-VLAs. 
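The restriction is easier to see with the coroutine frame model in mind: locals that may live across a suspension point are stored in the coroutine frame, and the frame's size must be known at compile time, so a runtime-sized array has nowhere to live. The following standalone sketch is illustrative only; it is not part of the patch, the task/demo names are invented for the example, and the promise boilerplate merely mirrors the test's Inputs/std-coroutine.h:

#include <coroutine>

struct task {
  struct promise_type {
    task get_return_object() { return {}; }
    std::suspend_always initial_suspend() noexcept { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
};

task demo(int n) {
  // With this patch, Clang reports: "variable length arrays in a coroutine
  // are not supported". The frame that would have to hold 'buf' has a
  // fixed, compile-time size, so the VLA cannot be given storage.
  int buf[n];
  co_return;
}

Note that the check is deliberately delayed: BuildArrayType only records the location in FunctionScopeInfo::FirstVLALoc, and CheckCompletedCoroutineBody emits the diagnostic once the function is known to be a coroutine, so ordinary functions using VLAs are unaffected.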
diff --git a/clang/test/SemaCXX/coroutine-vla.cpp b/clang/test/SemaCXX/coroutine-vla.cpp
new file mode 100644
index 0000000000000..176e35f346e2b
--- /dev/null
+++ b/clang/test/SemaCXX/coroutine-vla.cpp
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 %s -std=c++20 -fsyntax-only -Wno-vla-cxx-extension -verify
+#include "Inputs/std-coroutine.h"
+
+struct promise;
+
+struct coroutine : std::coroutine_handle<promise> {
+  using promise_type = ::promise;
+};
+
+struct promise
+{
+  coroutine get_return_object();
+  std::suspend_always initial_suspend() noexcept;
+  std::suspend_always final_suspend() noexcept;
+  void return_void();
+  void unhandled_exception();
+};
+
+coroutine foo(int n) {
+  int array[n]; // expected-error {{variable length arrays in a coroutine are not supported}}
+  co_return;
+}
+
+void lambda() {
+  [](int n) -> coroutine {
+    int array[n]; // expected-error {{variable length arrays in a coroutine are not supported}}
+    co_return;
+  }(10);
+}

From fd9b3e471e80d5c6aaf525f9fc3a04569eddb960 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Thu, 26 Oct 2023 09:06:57 -0700
Subject: [PATCH 094/877] [mlir][sparse] cleanup merger test, add header
 (#70279)

---
 .../Dialect/SparseTensor/MergerTest.cpp       | 227 ++++++++----------
 1 file changed, 101 insertions(+), 126 deletions(-)

diff --git a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
index 561753f631c16..e28d88e046fcd 100644
--- a/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
+++ b/mlir/unittests/Dialect/SparseTensor/MergerTest.cpp
@@ -1,3 +1,11 @@
+//===- MergerTest.cpp - Tests for the sparsifier's merger -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "mlir/Dialect/SparseTensor/Utils/Merger.h"
 #include "llvm/Support/Compiler.h"
 #include "gmock/gmock.h"
@@ -73,56 +81,43 @@ namespace {
 /// Helper classes/functions for testing Merger.
 ///

-/// Simple recursive data structure used to match expressions in `Merger`.
-struct Pattern;
-/// Since the patterns we need are rather small and short-lived, we use
-/// `Pattern const&` for "pointers" to patterns, rather than using
-/// something more elaborate like `std::shared_ptr<Pattern> const&`.
-using PatternRef = const Pattern &;
-struct Pattern {
+/// Simple recursive data structure used to match expressions in `Merger`,
+/// which uses const references into the short-lived data structures.
+struct Match {
   struct Children {
-    Children(PatternRef e0, PatternRef e1) : e0(e0), e1(e1) {}
-    PatternRef e0;
-    PatternRef e1;
+    Children(const Match &e0, const Match &e1) : e0(e0), e1(e1) {}
+    const Match &e0;
+    const Match &e1;
   };

-  TensorExp::Kind kind;
+  Match() : kind(TensorExp::Kind::kSynZero) {}
+  Match(TensorId tid) : kind(TensorExp::Kind::kTensor), tid(tid) {}
+  Match(TensorExp::Kind kind, const Match &e0, const Match &e1)
+      : kind(kind), children(e0, e1) {
+    assert(kind >= TensorExp::Kind::kMulF);
+  }

+  TensorExp::Kind kind;
   union {
-    /// Expressions representing tensors simply have a tensor number.
     TensorId tid;
-
-    /// Tensor operations point to their children.
     Children children;
   };
-
-  /// Constructors.
-  /// Rather than using these, please use the readable builder
-  /// functions below to make tests more readable.
- Pattern() : kind(TensorExp::Kind::kSynZero) {} - Pattern(TensorId tid) : kind(TensorExp::Kind::kTensor), tid(tid) {} - Pattern(TensorExp::Kind kind, PatternRef e0, PatternRef e1) - : kind(kind), children(e0, e1) { - assert(kind >= TensorExp::Kind::kMulF); - } }; /// -/// Readable Pattern builder functions. +/// Readable Match builder functions. /// These should be preferred over the actual constructors. /// -static Pattern tensorPattern(TensorId tid) { return Pattern(tid); } -static Pattern synZeroPattern() { return Pattern(); } +static Match tensorMatch(TensorId tid) { return Match(tid); } +static Match synZeroMatch() { return Match(); } #define IMPL_BINOP_PATTERN(OP, KIND) \ - LLVM_ATTRIBUTE_UNUSED static Pattern OP##Pattern(PatternRef e0, \ - PatternRef e1) { \ - return Pattern(KIND, e0, e1); \ + LLVM_ATTRIBUTE_UNUSED static Match OP##Match(const Match &e0, \ + const Match &e1) { \ + return Match(KIND, e0, e1); \ } - FOREVERY_BINOP(IMPL_BINOP_PATTERN) - #undef IMPL_BINOP_PATTERN class MergerTestBase : public ::testing::Test { @@ -150,9 +145,7 @@ class MergerTestBase : public ::testing::Test { LLVM_ATTRIBUTE_UNUSED ExprId OP##Expr(ExprId e0, ExprId e1) { \ return merger.addExp(KIND, e0, e1); \ } - FOREVERY_BINOP(IMPL_BINOP_EXPR) - #undef IMPL_BINOP_EXPR /// @@ -168,7 +161,7 @@ class MergerTestBase : public ::testing::Test { /// ordering within groups. If `simple` is true, then compare the /// `lat.simple` field instead to test the result after optimization. bool latPointWithinRange(LatSetId s, unsigned lo, unsigned n, - PatternRef pattern, const BitVector &bits, + const Match &pattern, const BitVector &bits, bool simple) { for (unsigned k = lo, hi = lo + n; k < hi; ++k) { if (compareExpression(merger.lat(merger.set(s)[k]).exp, pattern) && @@ -180,13 +173,13 @@ class MergerTestBase : public ::testing::Test { /// Wrapper over latPointWithinRange for readability of tests. void expectLatPointWithinRange(LatSetId s, unsigned lo, unsigned n, - PatternRef pattern, const BitVector &bits, + const Match &pattern, const BitVector &bits, bool simple = false) { EXPECT_TRUE(latPointWithinRange(s, lo, n, pattern, bits, simple)); } /// Wrapper over expectLatPointWithinRange for a single lat point. - void expectLatPoint(LatSetId s, unsigned lo, PatternRef pattern, + void expectLatPoint(LatSetId s, unsigned lo, const Match &pattern, const BitVector &bits, bool simple = false) { EXPECT_TRUE(latPointWithinRange(s, lo, 1, pattern, bits, simple)); } @@ -216,7 +209,7 @@ class MergerTestBase : public ::testing::Test { /// Compares expressions for equality. Equality is defined recursively as: /// - Operations are equal if they have the same kind and children. /// - Leaf tensors are equal if they refer to the same tensor. 
- bool compareExpression(ExprId e, PatternRef pattern) { + bool compareExpression(ExprId e, const Match &pattern) { const auto &tensorExp = merger.exp(e); if (tensorExp.kind != pattern.kind) return false; @@ -424,21 +417,19 @@ class MergerTest3T1LSo : public MergerTestBase { const auto t0 = tid(0); \ const auto t1 = tid(1); \ const auto t2 = tid(2); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ - PatternRef p2 = tensorPattern(t2); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ + const Match &p2 = tensorMatch(t2); \ auto s = merger.buildLattices(e, l0); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t1}}), true); \ } - FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF) - #undef IMPL_MERGER_TEST_CONJ_CONJ_UNDEF /// Vector multiplication (conjunction) of 2 vectors, i.e.; @@ -461,21 +452,19 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_UNDEF) const auto t1 = tid(1); \ const auto t2 = tid(2); \ const auto t3 = tid(3); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ - PatternRef p2 = tensorPattern(t2); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ + const Match &p2 = tensorMatch(t2); \ auto s = merger.buildLattices(e, l0); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t3}})); \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t3}}), true); \ } - FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT) - #undef IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT /// Vector addition (disjunction) of 2 vectors. 
i.e.; @@ -499,26 +488,24 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ_SPARSE_OUT) const auto l0 = lid(0); \ const auto t0 = tid(0); \ const auto t1 = tid(1); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 3); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ + expectLatPoint(s, 0, OP##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 2, p0, loopsToBits({{l0, t0}})); \ expectLatPointWithinRange(s, 1, 2, p1, loopsToBits({{l0, t1}})); \ \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 3); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ - loopsToBits({{l0, t0}, {l0, t1}}), true); \ + expectLatPoint(s, 0, OP##Match(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}), \ + true); \ expectLatPointWithinRange(s, 1, 2, p0, loopsToBits({{l0, t0}}), true); \ expectLatPointWithinRange(s, 1, 2, p1, loopsToBits({{l0, t1}}), true); \ } - FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ) - #undef IMPL_MERGER_TEST_DISJ /// Vector multiplication (conjunction) of 2 vectors, i.e.; @@ -533,22 +520,20 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_DISJ) const auto l0 = lid(0); \ const auto t0 = tid(0); \ const auto t1 = tid(1); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ + expectLatPoint(s, 0, OP##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ - loopsToBits({{l0, t0}, {l0, t1}}), true); \ + expectLatPoint(s, 0, OP##Match(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}), \ + true); \ } - FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ) - #undef IMPL_MERGER_TEST_CONJ /// Vector multiplication (conjunction) then addition (disjunction), i.e.; @@ -567,29 +552,27 @@ FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_CONJ) const auto t0 = tid(0); \ const auto t1 = tid(1); \ const auto t2 = tid(2); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ - PatternRef p2 = tensorPattern(t2); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ + const Match &p2 = tensorMatch(t2); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 3); \ - expectLatPoint(s, 0, DISJ##Pattern(CONJ##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, DISJ##Match(CONJ##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 2, CONJ##Pattern(p0, p1), \ + expectLatPointWithinRange(s, 1, 2, CONJ##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 2, p2, loopsToBits({{l0, t2}})); \ \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 3); \ - expectLatPoint(s, 0, DISJ##Pattern(CONJ##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, DISJ##Match(CONJ##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 2, CONJ##Pattern(p0, p1), \ + expectLatPointWithinRange(s, 1, 2, CONJ##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 2, p2, loopsToBits({{l0, t2}})); \ } - FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ) - #undef IMPL_MERGER_TEST_CONJ_DISJ /// Vector 
addition (disjunction) then addition (disjunction), i.e.; @@ -612,19 +595,19 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ) const auto t0 = tid(0); \ const auto t1 = tid(1); \ const auto t2 = tid(2); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ - PatternRef p2 = tensorPattern(t2); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ + const Match &p2 = tensorMatch(t2); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 7); \ - expectLatPoint(s, 0, DISJ2##Pattern(DISJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, DISJ2##Match(DISJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ2##Pattern(p1, p2), \ + expectLatPointWithinRange(s, 1, 6, DISJ2##Match(p1, p2), \ loopsToBits({{l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ2##Pattern(p0, p2), \ + expectLatPointWithinRange(s, 1, 6, DISJ2##Match(p0, p2), \ loopsToBits({{l0, t0}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ1##Pattern(p0, p1), \ + expectLatPointWithinRange(s, 1, 6, DISJ1##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 6, p2, loopsToBits({{l0, t2}})); \ expectLatPointWithinRange(s, 1, 6, p1, loopsToBits({{l0, t1}})); \ @@ -632,21 +615,19 @@ FOREVERY_PAIR_OF_COMMON_CONJ_DISJ_BINOP(IMPL_MERGER_TEST_CONJ_DISJ) \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 7); \ - expectLatPoint(s, 0, DISJ2##Pattern(DISJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, DISJ2##Match(DISJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ2##Pattern(p1, p2), \ + expectLatPointWithinRange(s, 1, 6, DISJ2##Match(p1, p2), \ loopsToBits({{l0, t1}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ2##Pattern(p0, p2), \ + expectLatPointWithinRange(s, 1, 6, DISJ2##Match(p0, p2), \ loopsToBits({{l0, t0}, {l0, t2}})); \ - expectLatPointWithinRange(s, 1, 6, DISJ1##Pattern(p0, p1), \ + expectLatPointWithinRange(s, 1, 6, DISJ1##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 6, p2, loopsToBits({{l0, t2}})); \ expectLatPointWithinRange(s, 1, 6, p1, loopsToBits({{l0, t1}})); \ expectLatPointWithinRange(s, 1, 6, p0, loopsToBits({{l0, t0}})); \ } - FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ) - #undef IMPL_MERGER_TEST_DISJ_DISJ /// Vector multiplication (conjunction) then multiplication (conjunction), i.e.; @@ -663,21 +644,19 @@ FOREVERY_PAIR_OF_COMMON_DISJ_DISJ_BINOP(IMPL_MERGER_TEST_DISJ_DISJ) const auto t0 = tid(0); \ const auto t1 = tid(1); \ const auto t2 = tid(2); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ - PatternRef p2 = tensorPattern(t2); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ + const Match &p2 = tensorMatch(t2); \ auto s = merger.buildLattices(e, l0); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}})); \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, CONJ2##Pattern(CONJ1##Pattern(p0, p1), p2), \ + expectLatPoint(s, 0, CONJ2##Match(CONJ1##Match(p0, p1), p2), \ loopsToBits({{l0, t0}, {l0, t1}, {l0, t2}}), true); \ } - FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ) - #undef IMPL_MERGER_TEST_CONJ_CONJ /// Vector 
addition (disjunction) of 2 vectors, i.e.; @@ -702,25 +681,23 @@ FOREVERY_PAIR_OF_COMMON_CONJ_CONJ_BINOP(IMPL_MERGER_TEST_CONJ_CONJ) const auto l0 = lid(0); \ const auto t0 = tid(0); \ const auto t1 = tid(1); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 3); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ + expectLatPoint(s, 0, OP##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ expectLatPointWithinRange(s, 1, 2, p0, loopsToBits({{l0, t0}})); \ expectLatPointWithinRange(s, 1, 2, p1, loopsToBits({{l0, t1}})); \ \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 2); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ - loopsToBits({{l0, t0}, {l0, t1}}), true); \ + expectLatPoint(s, 0, OP##Match(p0, p1), loopsToBits({{l0, t0}, {l0, t1}}), \ + true); \ expectLatPoint(s, 1, p1, loopsToBits({{l0, t1}}), true); \ } - FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ) - #undef IMPL_MERGER_TEST_OPTIMIZED_CONJ /// Vector multiplication (conjunction) of 2 vectors, i.e.: @@ -740,20 +717,20 @@ FOREVERY_COMMON_DISJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_DISJ) const auto l0 = lid(0); \ const auto t0 = tid(0); \ const auto t1 = tid(1); \ - PatternRef p0 = tensorPattern(t0); \ - PatternRef p1 = tensorPattern(t1); \ + const Match &p0 = tensorMatch(t0); \ + const Match &p1 = tensorMatch(t1); \ auto s = merger.buildLattices(e, l0); \ \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), \ + expectLatPoint(s, 0, OP##Match(p0, p1), \ loopsToBits({{l0, t0}, {l0, t1}})); \ \ s = merger.optimizeSet(s); \ expectNumLatPoints(s, 1); \ - expectLatPoint(s, 0, OP##Pattern(p0, p1), loopsToBits({{l0, t0}}), true); \ + expectLatPoint(s, 0, OP##Match(p0, p1), loopsToBits({{l0, t0}}), true); \ } - FOREVERY_COMMON_CONJ_BINOP(IMPL_MERGER_TEST_OPTIMIZED_CONJ) +#undef IMPL_MERGER_TEST_OPTIMIZED_CONJ /// Vector element-wise comparison (disjunction) of 2 vectors. 
i.e.; /// a(i) = b(i) + c(i) @@ -775,20 +752,20 @@ TEST_F(MergerTest3T1L, vector_cmp) { const auto l0 = lid(0); const auto t0 = tid(0); const auto t1 = tid(1); - PatternRef zero = synZeroPattern(); - PatternRef p0 = tensorPattern(t0); - PatternRef p1 = tensorPattern(t1); + const Match &zero = synZeroMatch(); + const Match &p0 = tensorMatch(t0); + const Match &p1 = tensorMatch(t1); auto s = merger.buildLattices(e, l0); - expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero), + expectLatPoint(s, 0, cmpiMatch(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); + expectLatPointWithinRange(s, 1, 2, cmpiMatch(p0, zero), loopsToBits({{l0, t0}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1), + expectLatPointWithinRange(s, 1, 2, cmpiMatch(zero, p1), loopsToBits({{l0, t1}})); s = merger.optimizeSet(s); - expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero), + expectLatPoint(s, 0, cmpiMatch(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); + expectLatPointWithinRange(s, 1, 2, cmpiMatch(p0, zero), loopsToBits({{l0, t0}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1), + expectLatPointWithinRange(s, 1, 2, cmpiMatch(zero, p1), loopsToBits({{l0, t1}})); } @@ -813,19 +790,17 @@ TEST_F(MergerTest3T1LD, vector_cmp) { const auto l0 = lid(0); const auto t0 = tid(0); const auto t1 = tid(1); - PatternRef zero = synZeroPattern(); - PatternRef p0 = tensorPattern(t0); - PatternRef p1 = tensorPattern(t1); + const Match &zero = synZeroMatch(); + const Match &p0 = tensorMatch(t0); + const Match &p1 = tensorMatch(t1); auto s = merger.buildLattices(e, l0); - expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(p0, zero), + expectLatPoint(s, 0, cmpiMatch(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); + expectLatPointWithinRange(s, 1, 2, cmpiMatch(p0, zero), loopsToBits({{l0, t0}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1), + expectLatPointWithinRange(s, 1, 2, cmpiMatch(zero, p1), loopsToBits({{l0, t1}})); s = merger.optimizeSet(s); - expectLatPoint(s, 0, cmpiPattern(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); - expectLatPointWithinRange(s, 1, 2, cmpiPattern(zero, p1), + expectLatPoint(s, 0, cmpiMatch(p0, p1), loopsToBits({{l0, t0}, {l0, t1}})); + expectLatPointWithinRange(s, 1, 2, cmpiMatch(zero, p1), loopsToBits({{l0, t1}})); } - -#undef IMPL_MERGER_TEST_OPTIMIZED_CONJ From a13696fd8490f67b6ad119fcb6fda20e1fd3a089 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Thu, 26 Oct 2023 15:55:24 +0000 Subject: [PATCH 095/877] [Flang][OpenMP] Port a few parallel tests to HLFIR flow These are copies of tests in flang/test/Lower/OpenMP/FIR. 
---
 .../parallel-firstprivate-clause-scalar.f90   | 202 ++++++++++
 .../parallel-lastprivate-clause-scalar.f90    | 253 ++++++++++++
 .../Lower/OpenMP/parallel-private-clause.f90  | 380 ++++++++++++++++++
 flang/test/Lower/OpenMP/parallel-wsloop.f90   | 298 ++++++++++++++
 flang/test/Lower/OpenMP/parallel.f90          | 206 ++++++++++
 5 files changed, 1339 insertions(+)
 create mode 100644 flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-private-clause.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-wsloop.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel.f90

diff --git a/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
new file mode 100644
index 0000000000000..6402f98a2addc
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-firstprivate-clause-scalar.f90
@@ -0,0 +1,202 @@
+! This test checks lowering of `FIRSTPRIVATE` clause for scalar types.
+
+! REQUIRES: shell
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s --check-prefix=CHECK
+
+!CHECK-DAG: func @_QPfirstprivate_complex(%[[ARG1:.*]]: !fir.ref>{{.*}}, %[[ARG2:.*]]: !fir.ref>{{.*}}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: omp.parallel {
+!CHECK: %[[ARG1_PVT:.*]] = fir.alloca !fir.complex<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_complexEarg1"}
+!CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_complexEarg1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG1_VAL:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG1_VAL]] to %[[ARG1_PVT_DECL]]#0 temporary_lhs : !fir.complex<4>, !fir.ref>
+!CHECK: %[[ARG2_PVT:.*]] = fir.alloca !fir.complex<8> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_complexEarg2"}
+!CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_complexEarg2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG2_VAL:.*]] = fir.load %[[ARG2_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG2_VAL]] to %[[ARG2_PVT_DECL]]#0 temporary_lhs : !fir.complex<8>, !fir.ref>
+!CHECK: fir.call @_QPfoo(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1) {{.*}}: (!fir.ref>, !fir.ref>) -> ()
+!CHECK: omp.terminator
+!CHECK: }
+
+subroutine firstprivate_complex(arg1, arg2)
+  complex(4) :: arg1
+  complex(8) :: arg2
+
+!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2)
+  call foo(arg1, arg2)
+!$OMP END PARALLEL
+
+end subroutine
+
+!CHECK-DAG: func @_QPfirstprivate_integer(%[[ARG1:.*]]: !fir.ref{{.*}}, %[[ARG2:.*]]: !fir.ref{{.*}}, %[[ARG3:.*]]: !fir.ref{{.*}}, %[[ARG4:.*]]: !fir.ref{{.*}}, %[[ARG5:.*]]: !fir.ref{{.*}}, %[[ARG6:.*]]: !fir.ref{{.*}}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK: %[[ARG1_PVT:.*]] = fir.alloca i32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_integerEarg1"}
+!CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_integerEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG1_VAL:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG1_VAL]] to %[[ARG1_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: %[[ARG2_PVT:.*]] = fir.alloca i8 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_integerEarg2"}
+!CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_integerEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_VAL:.*]] = fir.load %[[ARG2_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG2_VAL]] to %[[ARG2_PVT_DECL]]#0 temporary_lhs : i8, !fir.ref
+!CHECK: %[[ARG3_PVT:.*]] = fir.alloca i16 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_integerEarg3"}
+!CHECK: %[[ARG3_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG3_PVT]] {uniq_name = "_QFfirstprivate_integerEarg3"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG3_VAL:.*]] = fir.load %[[ARG3_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG3_VAL]] to %[[ARG3_PVT_DECL]]#0 temporary_lhs : i16, !fir.ref
+!CHECK: %[[ARG4_PVT:.*]] = fir.alloca i32 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_integerEarg4"}
+!CHECK: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]] {uniq_name = "_QFfirstprivate_integerEarg4"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG4_VAL:.*]] = fir.load %[[ARG4_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG4_VAL]] to %[[ARG4_PVT_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: %[[ARG5_PVT:.*]] = fir.alloca i64 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_integerEarg5"}
+!CHECK: %[[ARG5_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG5_PVT]] {uniq_name = "_QFfirstprivate_integerEarg5"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG5_VAL:.*]] = fir.load %[[ARG5_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG5_VAL]] to %[[ARG5_PVT_DECL]]#0 temporary_lhs : i64, !fir.ref
+!CHECK: %[[ARG6_PVT:.*]] = fir.alloca i128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_integerEarg6"}
+!CHECK: %[[ARG6_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG6_PVT]] {uniq_name = "_QFfirstprivate_integerEarg6"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG6_VAL:.*]] = fir.load %[[ARG6_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG6_VAL]] to %[[ARG6_PVT_DECL]]#0 temporary_lhs : i128, !fir.ref
+!CHECK: fir.call @_QPbar(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1, %[[ARG3_PVT_DECL]]#1, %[[ARG4_PVT_DECL]]#1, %[[ARG5_PVT_DECL]]#1, %[[ARG6_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref) -> ()
+!CHECK: omp.terminator
+!CHECK: }
+
+subroutine firstprivate_integer(arg1, arg2, arg3, arg4, arg5, arg6)
+  integer :: arg1
+  integer(kind=1) :: arg2
+  integer(kind=2) :: arg3
+  integer(kind=4) :: arg4
+  integer(kind=8) :: arg5
+  integer(kind=16) :: arg6
+
+!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
+  call bar(arg1, arg2, arg3, arg4, arg5, arg6)
+!$OMP END PARALLEL
+
+end subroutine
+
+!CHECK-DAG: func @_QPfirstprivate_logical(%[[ARG1:.*]]: !fir.ref>{{.*}}, %[[ARG2:.*]]: !fir.ref>{{.*}}, %[[ARG3:.*]]: !fir.ref>{{.*}}, %[[ARG4:.*]]: !fir.ref>{{.*}}, %[[ARG5:.*]]: !fir.ref>{{.*}}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: omp.parallel {
+!CHECK: %[[ARG1_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_logicalEarg1"}
+!CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg1"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG1_VAL:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG1_VAL]] to %[[ARG1_PVT_DECL]]#0 temporary_lhs : !fir.logical<4>, !fir.ref>
+!CHECK: %[[ARG2_PVT:.*]] = fir.alloca !fir.logical<1> {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_logicalEarg2"}
+!CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg2"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG2_VAL:.*]] = fir.load %[[ARG2_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG2_VAL]] to %[[ARG2_PVT_DECL]]#0 temporary_lhs : !fir.logical<1>, !fir.ref>
+!CHECK: %[[ARG3_PVT:.*]] = fir.alloca !fir.logical<2> {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_logicalEarg3"}
+!CHECK: %[[ARG3_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG3_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg3"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG3_VAL:.*]] = fir.load %[[ARG3_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG3_VAL]] to %[[ARG3_PVT_DECL]]#0 temporary_lhs : !fir.logical<2>, !fir.ref>
+!CHECK: %[[ARG4_PVT:.*]] = fir.alloca !fir.logical<4> {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_logicalEarg4"}
+!CHECK: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg4"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG4_VAL:.*]] = fir.load %[[ARG4_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG4_VAL]] to %[[ARG4_PVT_DECL]]#0 temporary_lhs : !fir.logical<4>, !fir.ref>
+!CHECK: %[[ARG5_PVT:.*]] = fir.alloca !fir.logical<8> {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_logicalEarg5"}
+!CHECK: %[[ARG5_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG5_PVT]] {uniq_name = "_QFfirstprivate_logicalEarg5"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+!CHECK: %[[ARG5_VAL:.*]] = fir.load %[[ARG5_DECL]]#0 : !fir.ref>
+!CHECK: hlfir.assign %[[ARG5_VAL]] to %[[ARG5_PVT_DECL]]#0 temporary_lhs : !fir.logical<8>, !fir.ref>
+!CHECK: fir.call @_QPbaz(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1, %[[ARG3_PVT_DECL]]#1, %[[ARG4_PVT_DECL]]#1, %[[ARG5_PVT_DECL]]#1) {{.*}}: (!fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>, !fir.ref>) -> ()
+!CHECK: omp.terminator
+!CHECK: }
+
+subroutine firstprivate_logical(arg1, arg2, arg3, arg4, arg5)
+  logical :: arg1
+  logical(kind=1) :: arg2
+  logical(kind=2) :: arg3
+  logical(kind=4) :: arg4
+  logical(kind=8) :: arg5
+
+!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5)
+  call baz(arg1, arg2, arg3, arg4, arg5)
+!$OMP END PARALLEL
+
+end subroutine
+
+!CHECK-DAG: func @_QPfirstprivate_real(%[[ARG1:.*]]: !fir.ref{{.*}}, %[[ARG2:.*]]: !fir.ref{{.*}}, %[[ARG3:.*]]: !fir.ref{{.*}}, %[[ARG4:.*]]: !fir.ref{{.*}}, %[[ARG5:.*]]: !fir.ref{{.*}}, %[[ARG6:.*]]: !fir.ref{{.*}}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG3_DECL:.*]]:2 = hlfir.declare %[[ARG3]] {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG4_DECL:.*]]:2 = hlfir.declare %[[ARG4]] {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG5_DECL:.*]]:2 = hlfir.declare %[[ARG5]] {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG6_DECL:.*]]:2 = hlfir.declare %[[ARG6]] {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK: %[[ARG1_PVT:.*]] = fir.alloca f32 {bindc_name = "arg1", pinned, uniq_name = "_QFfirstprivate_realEarg1"}
+!CHECK: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] {uniq_name = "_QFfirstprivate_realEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG1_VAL:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG1_VAL]] to %[[ARG1_PVT_DECL]]#0 temporary_lhs : f32, !fir.ref
+!CHECK: %[[ARG2_PVT:.*]] = fir.alloca f16 {bindc_name = "arg2", pinned, uniq_name = "_QFfirstprivate_realEarg2"}
+!CHECK: %[[ARG2_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG2_PVT]] {uniq_name = "_QFfirstprivate_realEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_VAL:.*]] = fir.load %[[ARG2_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG2_VAL]] to %[[ARG2_PVT_DECL]]#0 temporary_lhs : f16, !fir.ref
+!CHECK: %[[ARG3_PVT:.*]] = fir.alloca f32 {bindc_name = "arg3", pinned, uniq_name = "_QFfirstprivate_realEarg3"}
+!CHECK: %[[ARG3_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG3_PVT]] {uniq_name = "_QFfirstprivate_realEarg3"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG3_VAL:.*]] = fir.load %[[ARG3_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG3_VAL]] to %[[ARG3_PVT_DECL]]#0 temporary_lhs : f32, !fir.ref
+!CHECK: %[[ARG4_PVT:.*]] = fir.alloca f64 {bindc_name = "arg4", pinned, uniq_name = "_QFfirstprivate_realEarg4"}
+!CHECK: %[[ARG4_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG4_PVT]] {uniq_name = "_QFfirstprivate_realEarg4"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG4_VAL:.*]] = fir.load %[[ARG4_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG4_VAL]] to %[[ARG4_PVT_DECL]]#0 temporary_lhs : f64, !fir.ref
+!CHECK: %[[ARG5_PVT:.*]] = fir.alloca f80 {bindc_name = "arg5", pinned, uniq_name = "_QFfirstprivate_realEarg5"}
+!CHECK: %[[ARG5_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG5_PVT]] {uniq_name = "_QFfirstprivate_realEarg5"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG5_VAL:.*]] = fir.load %[[ARG5_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG5_VAL]] to %[[ARG5_PVT_DECL]]#0 temporary_lhs : f80, !fir.ref
+!CHECK: %[[ARG6_PVT:.*]] = fir.alloca f128 {bindc_name = "arg6", pinned, uniq_name = "_QFfirstprivate_realEarg6"}
+!CHECK: %[[ARG6_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG6_PVT]] {uniq_name = "_QFfirstprivate_realEarg6"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG6_VAL:.*]] = fir.load %[[ARG6_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[ARG6_VAL]] to %[[ARG6_PVT_DECL]]#0 temporary_lhs : f128, !fir.ref
+!CHECK: fir.call @_QPqux(%[[ARG1_PVT_DECL]]#1, %[[ARG2_PVT_DECL]]#1, %[[ARG3_PVT_DECL]]#1, %[[ARG4_PVT_DECL]]#1, %[[ARG5_PVT_DECL]]#1, %[[ARG6_PVT_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref, !fir.ref) -> ()
+!CHECK: omp.terminator
+!CHECK: }
+
+subroutine firstprivate_real(arg1, arg2, arg3, arg4, arg5, arg6)
+  real :: arg1
+  real(kind=2) :: arg2
+  real(kind=4) :: arg3
+  real(kind=8) :: arg4
+  real(kind=10) :: arg5
+  real(kind=16) :: arg6
+
+!$OMP PARALLEL FIRSTPRIVATE(arg1, arg2, arg3, arg4, arg5, arg6)
+  call qux(arg1, arg2, arg3, arg4, arg5, arg6)
+!$OMP END PARALLEL
+
+end subroutine
+
+!CHECK-LABEL: func.func @_QPmultiple_firstprivate(
+!CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref {fir.bindc_name = "a"},
+!CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref {fir.bindc_name = "b"}) {
+!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFmultiple_firstprivateEa"}
+!CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFmultiple_firstprivateEb"}
+!CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFmultiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: fir.call @_QPquux(%[[A_PRIV_DECL]]#1, %[[B_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> ()
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: return
+!CHECK: }
+
+subroutine multiple_firstprivate(a, b)
+  integer :: a, b
+!$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
+  call quux(a, b)
+!$OMP END PARALLEL
+end subroutine multiple_firstprivate
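The pattern these checks encode is the firstprivate copy-in: a pinned `fir.alloca`
for the clone, a `fir.load` of the original, and an `hlfir.assign ... temporary_lhs`
into the clone. A small self-contained program (hypothetical, for illustration only)
showing the semantics being lowered:

  program firstprivate_demo
    integer :: x
    x = 41
    !$omp parallel firstprivate(x)
    ! Each thread starts from a private copy initialized to 41;
    ! writes here never reach the original x.
    x = x + 1
    !$omp end parallel
    print *, x   ! still prints 41
  end program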
diff --git a/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90 b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
new file mode 100644
index 0000000000000..abd14f455123b
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-lastprivate-clause-scalar.f90
@@ -0,0 +1,253 @@
+! This test checks lowering of `LASTPRIVATE` clause for scalar types.
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+! RUN: flang-new -fc1 -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+!CHECK: func @_QPlastprivate_character(%[[ARG1:.*]]: !fir.boxchar<1>{{.*}}) {
+!CHECK-DAG: %[[ARG1_UNBOX:.*]]:2 = fir.unboxchar
+!CHECK-DAG: %[[FIVE:.*]] = arith.constant 5 : index
+!CHECK-DAG: %[[ARG1_REF:.*]] = fir.convert %[[ARG1_UNBOX]]#0 : (!fir.ref>) -> !fir.ref>
+!CHECK-DAG: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1_REF]] typeparams %[[FIVE]] {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+
+!CHECK: omp.parallel {
+!CHECK-DAG: %[[ARG1_PVT:.*]] = fir.alloca !fir.char<1,5> {bindc_name = "arg1",
+!CHECK-DAG: %[[ARG1_PVT_DECL:.*]]:2 = hlfir.declare %[[ARG1_PVT]] typeparams %[[FIVE]] {uniq_name = "_QFlastprivate_characterEarg1"} : (!fir.ref>, index) -> (!fir.ref>, !fir.ref>)
+
+! Check that we are accessing the clone inside the loop
+!CHECK-DAG: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+!CHECK-DAG: %[[NEG_ONE:.*]] = arith.constant -1 : i32
+!CHECK-NEXT: %[[ADDR:.*]] = fir.address_of(@_QQcl.
+!CHECK-NEXT: %[[CVT0:.*]] = fir.convert %[[ADDR]]
+!CHECK-NEXT: %[[CNST:.*]] = arith.constant
+!CHECK-NEXT: %[[CALL_BEGIN_IO:.*]] = fir.call @_FortranAioBeginExternalListOutput(%[[NEG_ONE]], %[[CVT0]], %[[CNST]]) {{.*}}: (i32, !fir.ref, i32) -> !fir.ref
+!CHECK-NEXT: %[[CVT_0_1:.*]] = fir.convert %[[ARG1_PVT_DECL]]#1
+!CHECK-NEXT: %[[CVT_0_2:.*]] = fir.convert %[[FIVE]]
+!CHECK-NEXT: %[[CALL_OP_ASCII:.*]] = fir.call @_FortranAioOutputAscii(%[[CALL_BEGIN_IO]], %[[CVT_0_1]], %[[CVT_0_2]])
+!CHECK-NEXT: %[[CALL_END_IO:.*]] = fir.call @_FortranAioEndIoStatement(%[[CALL_BEGIN_IO]])
+
+! Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+
+! Testing lastprivate val update
+!CHECK-DAG: hlfir.assign %[[ARG1_PVT_DECL]]#0 to %[[ARG1_DECL]]#0 temporary_lhs : !fir.ref>, !fir.ref>
+!CHECK-DAG: }
+!CHECK-DAG: omp.yield
+
+subroutine lastprivate_character(arg1)
+  character(5) :: arg1
+!$OMP PARALLEL
+!$OMP DO LASTPRIVATE(arg1)
+do n = 1, 5
+  arg1(n:n) = 'c'
+  print *, arg1
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+end subroutine
+
+!CHECK: func @_QPlastprivate_int(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK-DAG: omp.parallel {
+!CHECK-DAG: %[[CLONE:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK-DAG: %[[CLONE_DECL:.*]]:2 = hlfir.declare %[[CLONE]] {uniq_name = "_QFlastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+
+! Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+
+! Testing lastprivate val update
+!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-DAG: }
+!CHECK-DAG: omp.yield
+
+subroutine lastprivate_int(arg1)
+  integer :: arg1
+!$OMP PARALLEL
+!$OMP DO LASTPRIVATE(arg1)
+do n = 1, 5
+  arg1 = 2
+  print *, arg1
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+print *, arg1
+end subroutine
+
+!CHECK: func.func @_QPmult_lastprivate_int(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "arg2"}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
+!CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+
+! Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+! Testing lastprivate val update
+!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref
+!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref
+!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: }
+!CHECK: omp.yield
+
+subroutine mult_lastprivate_int(arg1, arg2)
+  integer :: arg1, arg2
+!$OMP PARALLEL
+!$OMP DO LASTPRIVATE(arg1) LASTPRIVATE(arg2)
+do n = 1, 5
+  arg1 = 2
+  arg2 = 3
+  print *, arg1, arg2
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+print *, arg1, arg2
+end subroutine
+
+!CHECK: func.func @_QPmult_lastprivate_int2(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "arg2"}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %arg0 {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %arg1 {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+!CHECK-DAG: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK-DAG: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFmult_lastprivate_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK-DAG: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
+!CHECK-DAG: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFmult_lastprivate_int2Earg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+
+!Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+!Testing lastprivate val update
+!CHECK-DAG: %[[CLONE_LD2:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref
+!CHECK-DAG: hlfir.assign %[[CLONE_LD2]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-DAG: %[[CLONE_LD1:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref
+!CHECK-DAG: hlfir.assign %[[CLONE_LD1]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK: }
+!CHECK: omp.yield
+
+subroutine mult_lastprivate_int2(arg1, arg2)
+  integer :: arg1, arg2
+!$OMP PARALLEL
+!$OMP DO LASTPRIVATE(arg1, arg2)
+do n = 1, 5
+  arg1 = 2
+  arg2 = 3
+  print *, arg1, arg2
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+print *, arg1, arg2
+end subroutine
+
+!CHECK: func.func @_QPfirstpriv_lastpriv_int(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}, %[[ARG2:.*]]: !fir.ref {fir.bindc_name = "arg2"}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[ARG2_DECL:.*]]:2 = hlfir.declare %[[ARG2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+! Firstprivate update
+!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref
+!CHECK: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref
+! Lastprivate Allocation
+!CHECK: %[[CLONE2:.*]] = fir.alloca i32 {bindc_name = "arg2"
+!CHECK: %[[CLONE2_DECL:.*]]:2 = hlfir.declare %[[CLONE2]] {uniq_name = "_QFfirstpriv_lastpriv_intEarg2"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK-NOT: omp.barrier
+!CHECK: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+
+! Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+! Testing lastprivate val update
+!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE2_DECL]]#0 : !fir.ref
+!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG2_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-NEXT: }
+!CHECK-NEXT: omp.yield
+
+subroutine firstpriv_lastpriv_int(arg1, arg2)
+  integer :: arg1, arg2
+!$OMP PARALLEL
+!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg2)
+do n = 1, 5
+  arg1 = 2
+  arg2 = 3
+  print *, arg1, arg2
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+print *, arg1, arg2
+end subroutine
+
+!CHECK: func.func @_QPfirstpriv_lastpriv_int2(%[[ARG1:.*]]: !fir.ref {fir.bindc_name = "arg1"}) {
+!CHECK: %[[ARG1_DECL:.*]]:2 = hlfir.declare %[[ARG1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK: omp.parallel {
+! Firstprivate update
+!CHECK: %[[CLONE1:.*]] = fir.alloca i32 {bindc_name = "arg1"
+!CHECK: %[[CLONE1_DECL:.*]]:2 = hlfir.declare %[[CLONE1]] {uniq_name = "_QFfirstpriv_lastpriv_int2Earg1"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+!CHECK-NEXT: %[[FPV_LD:.*]] = fir.load %[[ARG1_DECL]]#0 : !fir.ref
+!CHECK-NEXT: hlfir.assign %[[FPV_LD]] to %[[CLONE1_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-NEXT: omp.barrier
+!CHECK: omp.wsloop for (%[[INDX_WS:.*]]) : {{.*}} {
+! Testing last iteration check
+!CHECK: %[[V:.*]] = arith.addi %[[INDX_WS]], %{{.*}} : i32
+!CHECK: %[[C0:.*]] = arith.constant 0 : i32
+!CHECK: %[[T1:.*]] = arith.cmpi slt, %{{.*}}, %[[C0]] : i32
+!CHECK: %[[T2:.*]] = arith.cmpi slt, %[[V]], %{{.*}} : i32
+!CHECK: %[[T3:.*]] = arith.cmpi sgt, %[[V]], %{{.*}} : i32
+!CHECK: %[[IV_CMP:.*]] = arith.select %[[T1]], %[[T2]], %[[T3]] : i1
+!CHECK: fir.if %[[IV_CMP]] {
+!CHECK: fir.store %[[V]] to %{{.*}} : !fir.ref
+! Testing lastprivate val update
+!CHECK-NEXT: %[[CLONE_LD:.*]] = fir.load %[[CLONE1_DECL]]#0 : !fir.ref
+!CHECK-NEXT: hlfir.assign %[[CLONE_LD]] to %[[ARG1_DECL]]#0 temporary_lhs : i32, !fir.ref
+!CHECK-NEXT: }
+!CHECK-NEXT: omp.yield
+
+subroutine firstpriv_lastpriv_int2(arg1)
+  integer :: arg1
+!$OMP PARALLEL
+!$OMP DO FIRSTPRIVATE(arg1) LASTPRIVATE(arg1)
+do n = 1, 5
+  arg1 = 2
+  print *, arg1
+end do
+!$OMP END DO
+!$OMP END PARALLEL
+print *, arg1
+end subroutine
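The "last iteration check" blocks above all verify the same guard: an
`arith.cmpi`/`arith.select` test that the worksharing loop is on its sequentially
last iteration before the private clone is copied back out. A runnable sketch of
the semantics (hypothetical, for illustration only):

  program lastprivate_demo
    integer :: i, x
    x = 0
    !$omp parallel do lastprivate(x)
    do i = 1, 5
      x = i * 10
    end do
    !$omp end parallel do
    print *, x   ! prints 50, the value from iteration i = 5
  end program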
diff --git a/flang/test/Lower/OpenMP/parallel-private-clause.f90 b/flang/test/Lower/OpenMP/parallel-private-clause.f90
new file mode 100644
index 0000000000000..8d288f6483493
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-private-clause.f90
@@ -0,0 +1,380 @@
+! This test checks lowering of OpenMP parallel Directive with
+! `PRIVATE` clause present.
+
+! REQUIRES: shell
+! RUN: bbc --use-desc-for-alloc=false -fopenmp -emit-fir %s -o - | \
+! RUN:   FileCheck %s --check-prefix=FIRDialect
+
+!FIRDialect: func @_QPprivate_clause(%[[ARG1:.*]]: !fir.ref{{.*}}, %[[ARG2:.*]]: !fir.ref>{{.*}}, %[[ARG3:.*]]: !fir.boxchar<1>{{.*}}, %[[ARG4:.*]]: !fir.boxchar<1>{{.*}}) {
+!FIRDialect-DAG: %[[ALPHA:.*]] = fir.alloca i32 {{{.*}}, uniq_name = "{{.*}}Ealpha"}
+!FIRDialect-DAG: %[[ALPHA_ARRAY:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, uniq_name = "{{.*}}Ealpha_array"}
+!FIRDialect-DAG: %[[BETA:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, uniq_name = "{{.*}}Ebeta"}
+!FIRDialect-DAG: %[[BETA_ARRAY:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, uniq_name = "{{.*}}Ebeta_array"}
+
+!FIRDialect-DAG: omp.parallel {
+!FIRDialect-DAG: %[[ALPHA_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha"}
+!FIRDialect-DAG: %[[ALPHA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Ealpha_array"}
+!FIRDialect-DAG: %[[BETA_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta"}
+!FIRDialect-DAG: %[[BETA_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Ebeta_array"}
+!FIRDialect-DAG: %[[ARG1_PRIVATE:.*]] = fir.alloca i32 {{{.*}}, pinned, uniq_name = "{{.*}}Earg1"}
+!FIRDialect-DAG: %[[ARG2_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10xi32> {{{.*}}, pinned, uniq_name = "{{.*}}Earg2"}
+!FIRDialect-DAG: %[[ARG3_PRIVATE:.*]] = fir.alloca !fir.char<1,5> {{{.*}}, pinned, uniq_name = "{{.*}}Earg3"}
+!FIRDialect-DAG: %[[ARG4_ARRAY_PRIVATE:.*]] = fir.alloca !fir.array<10x!fir.char<1,5>> {{{.*}}, pinned, uniq_name = "{{.*}}Earg4"}
+!FIRDialect: omp.terminator
+!FIRDialect: }
+
+subroutine private_clause(arg1, arg2, arg3, arg4)
+
+  integer :: arg1, arg2(10)
+  integer :: alpha, alpha_array(10)
+  character(5) :: arg3, arg4(10)
+  character(5) :: beta, beta_array(10)
+
+!$OMP PARALLEL PRIVATE(alpha, alpha_array, beta, beta_array, arg1, arg2, arg3, arg4)
+  alpha = 1
+  alpha_array = 4
+  beta = "hi"
+  beta_array = "hi"
+  arg1 = 2
+  arg2 = 3
+  arg3 = "world"
+  arg4 = "world"
+!$OMP END PARALLEL
+
+end subroutine
+
+!FIRDialect: func @_QPprivate_clause_scalar() {
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", uniq_name = "{{.*}}Ec"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i8 {bindc_name = "i1", uniq_name = "{{.*}}Ei1"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i128 {bindc_name = "i16", uniq_name = "{{.*}}Ei16"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i16 {bindc_name = "i2", uniq_name = "{{.*}}Ei2"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i32 {bindc_name = "i4", uniq_name = "{{.*}}Ei4"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i64 {bindc_name = "i8", uniq_name = "{{.*}}Ei8"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", uniq_name = "{{.*}}El"}
+!FIRDialect-DAG: {{.*}} = fir.alloca f32 {bindc_name = "r", uniq_name = "{{.*}}Er"}
+
+!FIRDialect: omp.parallel {
+!FIRDialect-DAG: {{.*}} = fir.alloca i8 {bindc_name = "i1", pinned, uniq_name = "{{.*}}Ei1"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i16 {bindc_name = "i2", pinned, uniq_name = "{{.*}}Ei2"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i32 {bindc_name = "i4", pinned, uniq_name = "{{.*}}Ei4"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i64 {bindc_name = "i8", pinned, uniq_name = "{{.*}}Ei8"}
+!FIRDialect-DAG: {{.*}} = fir.alloca i128 {bindc_name = "i16", pinned, uniq_name = "{{.*}}Ei16"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.complex<4> {bindc_name = "c", pinned, uniq_name = "{{.*}}Ec"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.logical<4> {bindc_name = "l", pinned, uniq_name = "{{.*}}El"}
+!FIRDialect-DAG: {{.*}} = fir.alloca f32 {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+
+subroutine private_clause_scalar()
+
+  integer(kind=1) :: i1
+  integer(kind=2) :: i2
+  integer(kind=4) :: i4
+  integer(kind=8) :: i8
+  integer(kind=16) :: i16
+  complex :: c
+  logical :: l
+  real :: r
+
+!$OMP PARALLEL PRIVATE(i1, i2, i4, i8, i16, c, l, r)
+  print *, i1, i2, i4, i8, i16, c, l, r
+!$OMP END PARALLEL
+
+end subroutine
+
+!FIRDialect: func @_QPprivate_clause_derived_type() {
+!FIRDialect: {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", uniq_name = "{{.*}}Et"}
+
+!FIRDialect: omp.parallel {
+!FIRDialect: {{.*}} = fir.alloca !fir.type<{{.*}}{t_i:i32,t_arr:!fir.array<5xi32>}> {bindc_name = "t", pinned, uniq_name = "{{.*}}Et"}
+
+subroutine private_clause_derived_type()
+
+  type my_type
+    integer :: t_i
+    integer :: t_arr(5)
+  end type my_type
+  type(my_type) :: t
+
+!$OMP PARALLEL PRIVATE(t)
+  print *, t%t_i
+!$OMP END PARALLEL
+
+end subroutine
+
+!FIRDialect: func @_QPprivate_clause_allocatable() {
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box> {bindc_name = "x", uniq_name = "{{.*}}Ex"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.heap {uniq_name = "{{.*}}Ex.addr"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box>> {bindc_name = "x2", uniq_name = "{{.*}}Ex2"}
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.heap> {uniq_name = "{{.*}}Ex2.addr"}
+!FIRDialect-DAG: {{.*}} = fir.address_of(@{{.*}}Ex3) : !fir.ref>>
+!FIRDialect-DAG: [[TMP8:%.*]] = fir.address_of(@{{.*}}Ex4) : !fir.ref>>>
+
+!FIRDialect: omp.parallel {
+!FIRDialect-DAG: [[TMP35:%.*]] = fir.alloca !fir.box> {bindc_name = "x", pinned, uniq_name = "{{.*}}Ex"}
+!FIRDialect-DAG: [[TMP39:%.*]] = fir.alloca !fir.box>> {bindc_name = "x2", pinned, uniq_name = "{{.*}}Ex2"}
+!FIRDialect-DAG: [[TMP45:%.*]] = fir.alloca !fir.box> {bindc_name = "x3", pinned, uniq_name = "{{.*}}Ex3"}
+
+!FIRDialect-DAG: [[TMP51:%.*]] = fir.load [[TMP8]] : !fir.ref>>>
+!FIRDialect-DAG: [[TMP97:%.*]] = fir.load [[TMP8]] : !fir.ref>>>
+!FIRDialect-DAG: [[TMP98:%.*]]:3 = fir.box_dims [[TMP97]], {{.*}} : (!fir.box>>, index) -> (index, index, index)
+!FIRDialect-DAG: [[TMP50:%.*]] = fir.alloca !fir.box>> {bindc_name = "x4", pinned, uniq_name = "{{.*}}Ex4"}
+
+! FIRDialect-DAG: [[TMP101:%.*]] = fir.allocmem !fir.array, {{.*}} {fir.must_be_heap = true, uniq_name = "{{.*}}Ex4.alloc"}
+! FIRDialect-DAG: [[TMP102:%.*]] = fir.shape_shift {{.*}}#0, {{.*}} : (index, index) -> !fir.shapeshift<1>
+! FIRDialect-DAG: [[TMP103:%.*]] = fir.embox [[TMP101]]([[TMP102]]) : (!fir.heap>, !fir.shapeshift<1>) -> !fir.box>>
+! FIRDialect-DAG: fir.store [[TMP103]] to [[TMP50]] : !fir.ref>>>
+
+
+subroutine private_clause_allocatable()
+
+  integer, allocatable :: x, x2(:)
+  integer, allocatable, save :: x3, x4(:)
+
+  print *, x, x2, x3, x4
+
+!$OMP PARALLEL PRIVATE(x, x2, x3, x4)
+  print *, x, x2, x3, x4
+!$OMP END PARALLEL
+
+end subroutine
+
+
+!FIRDialect: func @_QPprivate_clause_real_call_allocatable() {
+!FIRDialect-DAG: {{.*}} = fir.alloca !fir.box> {bindc_name = "x5", uniq_name = "{{.*}}Ex5"}
+!FIRDialect-DAG: {{.*}} = fir.zero_bits !fir.heap
+!FIRDialect-DAG: {{.*}} = fir.embox %1 : (!fir.heap) -> !fir.box>
+!FIRDialect-DAG: fir.store %2 to %0 : !fir.ref>>
+!FIRDialect-DAG: omp.parallel {
+!FIRDialect-DAG: [[TMP203:%.*]] = fir.alloca !fir.box> {bindc_name = "x5", pinned, uniq_name = "{{.*}}Ex5"}
+
+!FIRDialect-DAG: fir.if %{{.*}} {
+
+!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref>>
+!FIRDialect-DAG: } else {
+
+!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref>>
+!FIRDialect-DAG: }
+!FIRDialect-DAG: fir.call @_QFprivate_clause_real_call_allocatablePhelper_private_clause_real_call_allocatable([[TMP203]]) fastmath : (!fir.ref>>) -> ()
+!FIRDialect-DAG: %{{.*}} = fir.load [[TMP203]] : !fir.ref>>
+
+!FIRDialect-DAG: fir.if %{{.*}} {
+!FIRDialect-DAG: %{{.*}} = fir.load [[TMP203]] : !fir.ref>>
+
+!FIRDialect-DAG: fir.store %{{.*}} to [[TMP203]] : !fir.ref>>
+!FIRDialect-DAG: }
+!FIRDialect-DAG: omp.terminator
+!FIRDialect-DAG: }
+!FIRDialect-DAG: return
+!FIRDialect-DAG: }
+
+
+subroutine private_clause_real_call_allocatable
+  real, allocatable :: x5
+  !$omp parallel private(x5)
+  call helper_private_clause_real_call_allocatable(x5)
+  !$omp end parallel
+contains
+  subroutine helper_private_clause_real_call_allocatable(x6)
+    real, allocatable :: x6
+    print *, allocated(x6)
+  end subroutine
+end subroutine
+
+!FIRDialect: func.func @_QPincrement_list_items(%arg0: !fir.ref>>}>>>> {fir.bindc_name = "head"}) {
+!FIRDialect: {{%.*}} = fir.alloca !fir.box>>}>>> {bindc_name = "p", uniq_name = "_QFincrement_list_itemsEp"}
+!FIRDialect: omp.parallel {
+!FIRDialect: {{%.*}} = fir.alloca !fir.box>>}>>> {bindc_name = "p", pinned, uniq_name = "_QFincrement_list_itemsEp"}
+!FIRDialect: omp.single {
+
+!FIRDialect: omp.terminator
+!FIRDialect: omp.terminator
+!FIRDialect: return
+
+subroutine increment_list_items (head)
+  type node
+    integer :: payload
+    type (node), pointer :: next
+  end type node
+
+  type (node), pointer :: head
+  type (node), pointer :: p
+!$omp parallel private(p)
+!$omp single
+  p => head
+  do
+    p => p%next
+    if ( associated (p) .eqv. .false. ) exit
+  end do
+!$omp end single
+!$omp end parallel
+end subroutine increment_list_items
+
+!FIRDialect: func.func @_QPparallel_pointer() {
+!FIRDialect-DAG: [[PP0:%.*]] = fir.alloca !fir.box> {bindc_name = "y1", uniq_name = "{{.*}}Ey1"}
+!FIRDialect-DAG: [[PP1:%.*]] = fir.alloca !fir.ptr {uniq_name = "{{.*}}Ey1.addr"}
+!FIRDialect-DAG: [[PP2:%.*]] = fir.zero_bits !fir.ptr
+!FIRDialect: fir.store [[PP2]] to [[PP1]] : !fir.ref>
+!FIRDialect-DAG: [[PP3:%.*]] = fir.alloca !fir.box>> {bindc_name = "y2", uniq_name = "{{.*}}Ey2"}
+
+!FIRDialect: fir.store %6 to %3 : !fir.ref>>>
+!FIRDialect-DAG: [[PP7:%.*]] = fir.alloca i32 {bindc_name = "z1", fir.target, uniq_name = "{{.*}}Ez1"}
+
+!FIRDialect-DAG: [[PP8:%.*]] = fir.alloca !fir.array<10xi32> {bindc_name = "z2", fir.target, uniq_name = "{{.*}}Ez2"}
+!FIRDialect: omp.parallel {
+!FIRDialect-DAG: [[PP9:%.*]] = fir.alloca !fir.box> {bindc_name = "y1", pinned, uniq_name = "{{.*}}Ey1"}
+!FIRDialect-DAG: [[PP10:%.*]] = fir.alloca !fir.box>> {bindc_name = "y2", pinned, uniq_name = "{{.*}}Ey2"}
+!FIRDialect-DAG: [[PP11:%.*]] = fir.embox [[PP7]] : (!fir.ref) -> !fir.box>
+!FIRDialect: fir.store [[PP11]] to [[PP9]] : !fir.ref>>
+!FIRDialect-DAG: [[PP12:%.*]] = fir.shape %c{{.*}} : (index) -> !fir.shape<1>
+!FIRDialect-DAG: [[PP13:%.*]] = fir.embox [[PP8]]([[PP12]]) : (!fir.ref>, !fir.shape<1>) -> !fir.box>>
+!FIRDialect: fir.store %13 to [[PP10]] : !fir.ref>>>
+!FIRDialect: omp.terminator
+!FIRDialect: }
+!FIRDialect: return
+!FIRDialect: }
+
+subroutine parallel_pointer()
+  integer, pointer :: y1, y2(:)
+  integer, target :: z1, z2(10)
+
+!$omp parallel private(y1, y2)
+  y1=>z1
+  y2=>z2
+!$omp end parallel
+end subroutine parallel_pointer
+
+
+!FIRDialect-LABEL: func @_QPsimple_loop_1()
+subroutine simple_loop_1
+  integer :: i
+  real, allocatable :: r;
+  ! FIRDialect: omp.parallel
+  !$OMP PARALLEL PRIVATE(r)
+  ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
+
+  ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+
+  ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
+
+  ! FIRDialect: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP DO
+  do i=1, 9
+    ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref
+    ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref
+    ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! FIRDialect: omp.yield
+  ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: fir.if {{%.*}} {
+  ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap
+  ! FIRDialect: fir.freemem [[AD]] : !fir.heap
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  !$OMP END DO
+  ! FIRDialect: omp.terminator
+  !$OMP END PARALLEL
+end subroutine
+
+!FIRDialect-LABEL: func @_QPsimple_loop_2()
+subroutine simple_loop_2
+  integer :: i
+  real, allocatable :: r;
+  ! FIRDialect: omp.parallel
+  !$OMP PARALLEL
+  ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
+
+  ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+
+  ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
+
+  ! FIRDialect: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP DO PRIVATE(r)
+  do i=1, 9
+    ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref
+    ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref
+    ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! FIRDialect: omp.yield
+  ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: fir.if {{%.*}} {
+  ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap
+  ! FIRDialect: fir.freemem [[AD]] : !fir.heap
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  !$OMP END DO
+  ! FIRDialect: omp.terminator
+  !$OMP END PARALLEL
+end subroutine
+
+!FIRDialect-LABEL: func @_QPsimple_loop_3()
+subroutine simple_loop_3
+  integer :: i
+  real, allocatable :: r;
+  ! FIRDialect: omp.parallel
+  ! FIRDialect: %[[ALLOCA_IV:.*]] = fir.alloca i32 {{{.*}}, pinned}
+
+  ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+
+  ! FIRDialect: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! FIRDialect: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! FIRDialect: %[[WS_STEP:.*]] = arith.constant 1 : i32
+
+  ! FIRDialect: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP PARALLEL DO PRIVATE(r)
+  do i=1, 9
+    ! FIRDialect: fir.store %[[I]] to %[[ALLOCA_IV:.*]] : !fir.ref
+    ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[ALLOCA_IV]] : !fir.ref
+    ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! FIRDialect: omp.yield
+  ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: fir.if {{%.*}} {
+  ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap
+  ! FIRDialect: fir.freemem [[AD]] : !fir.heap
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  !$OMP END PARALLEL DO
+  ! FIRDialect: omp.terminator
+end subroutine
+
+!CHECK-LABEL: func @_QPsimd_loop_1()
+subroutine simd_loop_1
+  integer :: i
+  real, allocatable :: r;
+  ! FIRDialect: [[R:%.*]] = fir.alloca !fir.box> {bindc_name = "r", pinned, uniq_name = "{{.*}}Er"}
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+
+  ! FIRDialect: %[[LB:.*]] = arith.constant 1 : i32
+  ! FIRDialect: %[[UB:.*]] = arith.constant 9 : i32
+  ! FIRDialect: %[[STEP:.*]] = arith.constant 1 : i32
+
+  ! FIRDialect: omp.simdloop for (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  !$OMP SIMD PRIVATE(r)
+  do i=1, 9
+    ! FIRDialect: fir.store %[[I]] to %[[LOCAL:.*]] : !fir.ref
+    ! FIRDialect: %[[LOAD_IV:.*]] = fir.load %[[LOCAL]] : !fir.ref
+    ! FIRDialect: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  !$OMP END SIMD
+  ! FIRDialect: omp.yield
+  ! FIRDialect: {{%.*}} = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: fir.if {{%.*}} {
+  ! FIRDialect: [[LD:%.*]] = fir.load [[R]] : !fir.ref>>
+  ! FIRDialect: [[AD:%.*]] = fir.box_addr [[LD]] : (!fir.box>) -> !fir.heap
+  ! FIRDialect: fir.freemem [[AD]] : !fir.heap
+  ! FIRDialect: fir.store {{%.*}} to [[R]] : !fir.ref>>
+end subroutine
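The allocatable cases above check that each private copy gets its own descriptor
box and a conditional allocate/free at region entry and exit. A runnable sketch of
the source-level behavior (hypothetical, for illustration only):

  program private_alloc_demo
    real, allocatable :: r(:)
    !$omp parallel private(r)
    ! The private r starts unallocated here (the original was never
    ! allocated); each thread manages its own copy independently.
    allocate(r(8))
    r = 0.0
    deallocate(r)
    !$omp end parallel
  end program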
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop.f90 b/flang/test/Lower/OpenMP/parallel-wsloop.f90
new file mode 100644
index 0000000000000..c06f941b74b58
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-wsloop.f90
@@ -0,0 +1,298 @@
+! This test checks lowering of OpenMP DO Directive (Worksharing).
+
+! RUN: bbc -fopenmp -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPsimple_parallel_do
+subroutine simple_parallel_do
+  integer :: i
+  ! CHECK: omp.parallel
+  ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP PARALLEL DO
+  do i=1, 9
+    ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]]#1 : !fir.ref
+    ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]]#0 : !fir.ref
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! CHECK: omp.yield
+  ! CHECK: omp.terminator
+  !$OMP END PARALLEL DO
+end subroutine
+
+! CHECK-LABEL: func @_QPparallel_do_with_parallel_clauses
+! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref {fir.bindc_name = "nt"}
+subroutine parallel_do_with_parallel_clauses(cond, nt)
+  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] {uniq_name = "_QFparallel_do_with_parallel_clausesEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_parallel_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+  logical :: cond
+  integer :: nt
+  integer :: i
+  ! CHECK: %[[COND:.*]] = fir.load %[[COND_DECL]]#0 : !fir.ref>
+  ! CHECK: %[[COND_CVT:.*]] = fir.convert %[[COND]] : (!fir.logical<4>) -> i1
+  ! CHECK: %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref
+  ! CHECK: omp.parallel if(%[[COND_CVT]] : i1) num_threads(%[[NT]] : i32) proc_bind(close)
+  ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP PARALLEL DO IF(cond) NUM_THREADS(nt) PROC_BIND(close)
+  do i=1, 9
+    ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]]#1 : !fir.ref
+    ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]]#0 : !fir.ref
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! CHECK: omp.yield
+  ! CHECK: omp.terminator
+  !$OMP END PARALLEL DO
+end subroutine
+
+! CHECK-LABEL: func @_QPparallel_do_with_clauses
+! CHECK-SAME: %[[NT_REF:.*]]: !fir.ref {fir.bindc_name = "nt"}
+subroutine parallel_do_with_clauses(nt)
+  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+  integer :: nt
+  integer :: i
+  ! CHECK: %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref
+  ! CHECK: omp.parallel num_threads(%[[NT]] : i32)
+  ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop schedule(dynamic) for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP PARALLEL DO NUM_THREADS(nt) SCHEDULE(dynamic)
+  do i=1, 9
+    ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]]#1 : !fir.ref
+    ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]]#0 : !fir.ref
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i
+  end do
+  ! CHECK: omp.yield
+  ! CHECK: omp.terminator
+  !$OMP END PARALLEL DO
+end subroutine
+
+!===============================================================================
+! Checking for the following construct:
+!   !$omp parallel do private(...) firstprivate(...)
+!===============================================================================
+
+! CHECK-LABEL: func @_QPparallel_do_with_privatisation_clauses
+! CHECK-SAME: %[[COND_REF:.*]]: !fir.ref> {fir.bindc_name = "cond"}, %[[NT_REF:.*]]: !fir.ref {fir.bindc_name = "nt"}
+subroutine parallel_do_with_privatisation_clauses(cond,nt)
+  ! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+  ! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+  logical :: cond
+  integer :: nt
+  integer :: i
+  ! CHECK: omp.parallel
+  ! CHECK: %[[PRIVATE_COND_REF:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"}
+  ! CHECK: %[[PRIVATE_COND_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_COND_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+  ! CHECK: %[[PRIVATE_NT_REF:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"}
+  ! CHECK: %[[PRIVATE_NT_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_NT_REF]] {uniq_name = "_QFparallel_do_with_privatisation_clausesEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+  ! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref
+  ! CHECK: hlfir.assign %[[NT_VAL]] to %[[PRIVATE_NT_DECL]]#0 temporary_lhs : i32, !fir.ref
+  ! CHECK: %[[WS_LB:.*]] = arith.constant 1 : i32
+  ! CHECK: %[[WS_UB:.*]] = arith.constant 9 : i32
+  ! CHECK: %[[WS_STEP:.*]] = arith.constant 1 : i32
+  ! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[WS_LB]]) to (%[[WS_UB]]) inclusive step (%[[WS_STEP]])
+  !$OMP PARALLEL DO PRIVATE(cond) FIRSTPRIVATE(nt)
+  do i=1, 9
+    ! CHECK: fir.store %[[I]] to %[[IV_ADDR:.*]]#1 : !fir.ref
+    ! CHECK: %[[LOAD_IV:.*]] = fir.load %[[IV_ADDR]]#0 : !fir.ref
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[LOAD_IV]]) {{.*}}: (!fir.ref, i32) -> i1
+    ! CHECK: %[[PRIVATE_COND_VAL:.*]] = fir.load %[[PRIVATE_COND_DECL]]#0 : !fir.ref>
+    ! CHECK: %[[PRIVATE_COND_VAL_CVT:.*]] = fir.convert %[[PRIVATE_COND_VAL]] : (!fir.logical<4>) -> i1
+    ! CHECK: fir.call @_FortranAioOutputLogical({{.*}}, %[[PRIVATE_COND_VAL_CVT]]) {{.*}}: (!fir.ref, i1) -> i1
+    ! CHECK: %[[PRIVATE_NT_VAL:.*]] = fir.load %[[PRIVATE_NT_DECL]]#0 : !fir.ref
+    ! CHECK: fir.call @_FortranAioOutputInteger32({{.*}}, %[[PRIVATE_NT_VAL]]) {{.*}}: (!fir.ref, i32) -> i1
+    print*, i, cond, nt
+  end do
+  ! CHECK: omp.yield
+  ! CHECK: omp.terminator
+  !$OMP END PARALLEL DO
+end subroutine
+
+!===============================================================================
+! Checking for the following construct
+!   !$omp parallel private(...) firstprivate(...)
+!   !$omp do
+!===============================================================================
+
+subroutine parallel_private_do(cond,nt)
+logical :: cond
+  integer :: nt
+  integer :: i
+  !$OMP PARALLEL PRIVATE(cond) FIRSTPRIVATE(nt)
+  !$OMP DO
+  do i=1, 9
+    call foo(i, cond, nt)
+  end do
+  !$OMP END DO
+  !$OMP END PARALLEL
+end subroutine parallel_private_do
+
+! CHECK-LABEL: func.func @_QPparallel_private_do(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "cond"},
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "nt"}) {
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: omp.parallel {
+! CHECK: %[[I_PRIV:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV]] {uniq_name = "_QFparallel_private_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[COND_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_private_doEcond"}
+! CHECK: %[[COND_DECL:.*]]:2 = hlfir.declare %[[COND_ADDR]] {uniq_name = "_QFparallel_private_doEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[NT_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_private_doEnt"}
+! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_private_doEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[NT:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[NT]] to %[[NT_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
+! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref
+! CHECK: fir.call @_QPfoo(%[[I_PRIV_DECL]]#1, %[[COND_DECL]]#1, %[[NT_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref>, !fir.ref) -> ()
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+
+!===============================================================================
+! Checking for the following construct
+!   !$omp parallel
+!   !$omp do firstprivate(...) firstprivate(...)
+!===============================================================================
+
+subroutine omp_parallel_multiple_firstprivate_do(a, b)
+  integer::a, b
+  !$OMP PARALLEL FIRSTPRIVATE(a) FIRSTPRIVATE(b)
+  !$OMP DO
+  do i=1, 10
+    call bar(i, a)
+  end do
+  !$OMP END DO
+  !$OMP END PARALLEL
+end subroutine omp_parallel_multiple_firstprivate_do
+
+! CHECK-LABEL: func.func @_QPomp_parallel_multiple_firstprivate_do(
+! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref {fir.bindc_name = "a"},
+! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref {fir.bindc_name = "b"}) {
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: omp.parallel {
+! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEi"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"}
+! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"}
+! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_multiple_firstprivate_doEb"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
+! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref
+! CHECK: fir.call @_QPbar(%[[I_PRIV_DECL]]#1, %[[A_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> ()
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+
+!===============================================================================
+! Checking for the following construct
+!   !$omp parallel
+!   !$omp do private(...) firstprivate(...)
+!===============================================================================
+
+subroutine parallel_do_private(cond,nt)
+logical :: cond
+  integer :: nt
+  integer :: i
+  !$OMP PARALLEL
+  !$OMP DO PRIVATE(cond) FIRSTPRIVATE(nt)
+  do i=1, 9
+    call foo(i, cond, nt)
+  end do
+  !$OMP END DO
+  !$OMP END PARALLEL
+end subroutine parallel_do_private
+
+! CHECK-LABEL: func.func @_QPparallel_do_private(
+! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref> {fir.bindc_name = "cond"},
+! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "nt"}) {
+! CHECK: %[[NT_DECL:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: omp.parallel {
+! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[COND_PRIV_ADDR:.*]] = fir.alloca !fir.logical<4> {bindc_name = "cond", pinned, uniq_name = "_QFparallel_do_privateEcond"}
+! CHECK: %[[COND_PRIV_DECL:.*]]:2 = hlfir.declare %[[COND_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEcond"} : (!fir.ref>) -> (!fir.ref>, !fir.ref>)
+! CHECK: %[[NT_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "nt", pinned, uniq_name = "_QFparallel_do_privateEnt"}
+! CHECK: %[[NT_PRIV_DECL:.*]]:2 = hlfir.declare %[[NT_PRIV_ADDR]] {uniq_name = "_QFparallel_do_privateEnt"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[NT_VAL:.*]] = fir.load %[[NT_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[NT_VAL]] to %[[NT_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_8:.*]] = arith.constant 9 : i32
+! CHECK: %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref
+! CHECK: fir.call @_QPfoo(%[[I_PRIV_DECL]]#1, %[[COND_PRIV_DECL]]#1, %[[NT_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref>, !fir.ref) -> ()
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
+
+!===============================================================================
+! Checking for the following construct
+!   !$omp parallel
+!   !$omp do firstprivate(...) firstprivate(...)
+!===============================================================================
+
+subroutine omp_parallel_do_multiple_firstprivate(a, b)
+  integer::a, b
+  !$OMP PARALLEL
+  !$OMP DO FIRSTPRIVATE(a) FIRSTPRIVATE(b)
+  do i=1, 10
+    call bar(i, a)
+  end do
+  !$OMP END DO
+  !$OMP END PARALLEL
+end subroutine omp_parallel_do_multiple_firstprivate
+
+! CHECK-LABEL: func.func @_QPomp_parallel_do_multiple_firstprivate(
+! CHECK-SAME: %[[A_ADDR:.*]]: !fir.ref {fir.bindc_name = "a"},
+! CHECK-SAME: %[[B_ADDR:.*]]: !fir.ref {fir.bindc_name = "b"}) {
+! CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[B_DECL:.*]]:2 = hlfir.declare %[[B_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref
+! CHECK: omp.parallel {
+! CHECK: %[[I_PRIV_ADDR:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK: %[[I_PRIV_DECL:.*]]:2 = hlfir.declare %[[I_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEi"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[A_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "a", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"}
+! CHECK: %[[A_PRIV_DECL:.*]]:2 = hlfir.declare %[[A_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEa"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[A:.*]] = fir.load %[[A_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[A]] to %[[A_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[B_PRIV_ADDR:.*]] = fir.alloca i32 {bindc_name = "b", pinned, uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"}
+! CHECK: %[[B_PRIV_DECL:.*]]:2 = hlfir.declare %[[B_PRIV_ADDR]] {uniq_name = "_QFomp_parallel_do_multiple_firstprivateEb"} : (!fir.ref) -> (!fir.ref, !fir.ref)
+! CHECK: %[[B:.*]] = fir.load %[[B_DECL]]#0 : !fir.ref
+! CHECK: hlfir.assign %[[B]] to %[[B_PRIV_DECL]]#0 temporary_lhs : i32, !fir.ref
+! CHECK: %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK: %[[VAL_9:.*]] = arith.constant 10 : i32
+! CHECK: %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK: omp.wsloop for (%[[I:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK: fir.store %[[I]] to %[[I_PRIV_DECL]]#1 : !fir.ref
+! CHECK: fir.call @_QPbar(%[[I_PRIV_DECL]]#1, %[[A_PRIV_DECL]]#1) {{.*}}: (!fir.ref, !fir.ref) -> ()
+! CHECK: omp.yield
+! CHECK: }
+! CHECK: omp.terminator
+! CHECK: }
+! CHECK: return
+! CHECK: }
diff --git a/flang/test/Lower/OpenMP/parallel.f90 b/flang/test/Lower/OpenMP/parallel.f90
new file mode 100644
index 0000000000000..0e43244994cfd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel.f90
@@ -0,0 +1,206 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+!CHECK-LABEL: func @_QPparallel_simple
+subroutine parallel_simple()
+  !CHECK: omp.parallel
+!$omp parallel
+  !CHECK: fir.call
+  call f1()
+!$omp end parallel
+end subroutine parallel_simple
+! `if` clause
+!===============================================================================
+
+!CHECK-LABEL: func @_QPparallel_if
+subroutine parallel_if(alpha, beta, gamma)
+  integer, intent(in) :: alpha
+  logical, intent(in) :: beta
+  logical(1) :: logical1
+  logical(2) :: logical2
+  logical(4) :: logical4
+  logical(8) :: logical8
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(alpha .le. 0)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(.false.)
+  !CHECK: fir.call
+  call f2()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(alpha .ge. 0)
+  !CHECK: fir.call
+  call f3()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(.true.)
+  !CHECK: fir.call
+  call f4()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(beta)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(logical1)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(logical2)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(logical4)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if(%{{.*}} : i1) {
+  !$omp parallel if(logical8)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+end subroutine parallel_if
+
+!===============================================================================
+! `num_threads` clause
+!===============================================================================
+
+!CHECK-LABEL: func @_QPparallel_numthreads
+subroutine parallel_numthreads(num_threads)
+  integer, intent(inout) :: num_threads
+
+  !CHECK: omp.parallel num_threads(%{{.*}}: i32) {
+  !$omp parallel num_threads(16)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  num_threads = 4
+
+  !CHECK: omp.parallel num_threads(%{{.*}} : i32) {
+  !$omp parallel num_threads(num_threads)
+  !CHECK: fir.call
+  call f2()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+end subroutine parallel_numthreads
+
+!===============================================================================
+! `proc_bind` clause
+!===============================================================================
+
+!CHECK-LABEL: func @_QPparallel_proc_bind
+subroutine parallel_proc_bind()
+
+  !CHECK: omp.parallel proc_bind(master) {
+  !$omp parallel proc_bind(master)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel proc_bind(close) {
+  !$omp parallel proc_bind(close)
+  !CHECK: fir.call
+  call f2()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel proc_bind(spread) {
+  !$omp parallel proc_bind(spread)
+  !CHECK: fir.call
+  call f3()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+end subroutine parallel_proc_bind
+
+!===============================================================================
+! `allocate` clause
+!===============================================================================
+
+!CHECK-LABEL: func @_QPparallel_allocate
+subroutine parallel_allocate()
+  use omp_lib
+  integer :: x
+  !CHECK: omp.parallel allocate(
+  !CHECK: %{{.+}} : i32 -> %{{.+}} : !fir.ref<i32>
+  !CHECK: ) {
+  !$omp parallel allocate(omp_high_bw_mem_alloc: x) private(x)
+  !CHECK: arith.addi
+  x = x + 12
+  !CHECK: omp.terminator
+  !$omp end parallel
+end subroutine parallel_allocate
+
+!===============================================================================
+! multiple clauses
+!===============================================================================
+
+!CHECK-LABEL: func @_QPparallel_multiple_clauses
+subroutine parallel_multiple_clauses(alpha, num_threads)
+  use omp_lib
+  integer, intent(inout) :: alpha
+  integer, intent(in) :: num_threads
+
+  !CHECK: omp.parallel if({{.*}} : i1) proc_bind(master) {
+  !$omp parallel if(alpha .le. 0) proc_bind(master)
+  !CHECK: fir.call
+  call f1()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel num_threads({{.*}} : i32) proc_bind(close) {
+  !$omp parallel proc_bind(close) num_threads(num_threads)
+  !CHECK: fir.call
+  call f2()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) {
+  !$omp parallel num_threads(num_threads) if(alpha .le. 0)
+  !CHECK: fir.call
+  call f3()
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+  !CHECK: omp.parallel if({{.*}} : i1) num_threads({{.*}} : i32) allocate(
+  !CHECK: %{{.+}} : i32 -> %{{.+}} : !fir.ref<i32>
+  !CHECK: ) {
+  !$omp parallel num_threads(num_threads) if(alpha .le. 0) allocate(omp_high_bw_mem_alloc: alpha) private(alpha)
+  !CHECK: fir.call
+  call f3()
+  !CHECK: arith.addi
+  alpha = alpha + 12
+  !CHECK: omp.terminator
+  !$omp end parallel
+
+end subroutine parallel_multiple_clauses

From 45ccc1666c723e11d7b0148b2ef5c37c7a36e916 Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Thu, 26 Oct 2023 16:17:14 +0000
Subject: [PATCH 096/877] [lldb][test][Windows] XFAIL IR memory map test

Since https://github.com/llvm/llvm-project/pull/68052 this has been failing.
https://lab.llvm.org/buildbot/#/builders/219/builds/6545

Follow-up changes have not fixed it; XFAIL while I debug it.

---
 lldb/test/Shell/Expr/TestIRMemoryMapWindows.test | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/test/Shell/Expr/TestIRMemoryMapWindows.test b/lldb/test/Shell/Expr/TestIRMemoryMapWindows.test
index ae29492c9ccc9..f9f4da3c40920 100644
--- a/lldb/test/Shell/Expr/TestIRMemoryMapWindows.test
+++ b/lldb/test/Shell/Expr/TestIRMemoryMapWindows.test
@@ -1,4 +1,5 @@
 # REQUIRES: system-windows
+# XFAIL: system-windows
 
 # RUN: %clang_cl_host /Zi /GS- %p/Inputs/call-function.cpp /c /o %t.obj
 # RUN: %msvc_link /debug:full %t.obj /out:%t

From b67d3702577d4a1848bce9be9887e554178a421a Mon Sep 17 00:00:00 2001
From: Daniel Grumberg
Date: Thu, 26 Oct 2023 17:32:36 +0100
Subject: [PATCH 097/877] [clang] Prioritize decl comments from macro
 expansion site (#65481)

For declarations defined inside a macro, e.g.:

```
#define MAKE_FUNC(suffix)      \
  /// Not selected doc comment \
  void func_##suffix(void) { }

/// Doc comment foo
MAKE_FUNC(foo)

/// Doc comment bar
MAKE_FUNC(bar)
```

Prefer the doc comment at the expansion site instead of the one defined in
the macro.
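As a rough illustration of the resulting lookup order (the `MAKE_GETTER`
macro and its comments below are hypothetical, not taken from this patch's
tests): the expansion-site doc comment is attached first, and the comment
inside the macro is only used as a fallback when the expansion site has none.

```
#define MAKE_GETTER(name)                      \
  /** Fallback doc comment inside the macro */ \
  int get_##name(void) { return 0; }

/// Attached to get_width: the expansion-site comment wins
MAKE_GETTER(width)

MAKE_GETTER(height) /* no comment here, so get_height falls back
                       to the doc comment inside the macro */
```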
rdar://113995729 --- clang/lib/AST/ASTContext.cpp | 197 ++++++++------------- clang/test/Index/annotate-comments-objc.m | 48 +++-- clang/unittests/Tooling/SourceCodeTest.cpp | 8 +- 3 files changed, 102 insertions(+), 151 deletions(-) diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0b6ed82cc5ba0..fea7f2400b31e 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -112,10 +112,10 @@ enum FloatingRank { Ibm128Rank }; -/// \returns location that is relevant when searching for Doc comments related -/// to \p D. -static SourceLocation getDeclLocForCommentSearch(const Decl *D, - SourceManager &SourceMgr) { +/// \returns The locations that are relevant when searching for Doc comments +/// related to \p D. +static SmallVector +getDeclLocsForCommentSearch(const Decl *D, SourceManager &SourceMgr) { assert(D); // User can not attach documentation to implicit declarations. @@ -167,115 +167,48 @@ static SourceLocation getDeclLocForCommentSearch(const Decl *D, isa(D)) return {}; + SmallVector Locations; // Find declaration location. // For Objective-C declarations we generally don't expect to have multiple // declarators, thus use declaration starting location as the "declaration // location". // For all other declarations multiple declarators are used quite frequently, // so we use the location of the identifier as the "declaration location". + SourceLocation BaseLocation; if (isa(D) || isa(D) || - isa(D) || - isa(D) || + isa(D) || isa(D) || isa(D) || // Allow association with Y across {} in `typedef struct X {} Y`. isa(D)) - return D->getBeginLoc(); + BaseLocation = D->getBeginLoc(); + else + BaseLocation = D->getLocation(); - const SourceLocation DeclLoc = D->getLocation(); - if (DeclLoc.isMacroID()) { - // There are (at least) three types of macros we care about here. - // - // 1. Macros that are used in the definition of a type outside the macro, - // with a comment attached at the macro call site. - // ``` - // #define MAKE_NAME(Foo) Name##Foo - // - // /// Comment is here, where we use the macro. - // struct MAKE_NAME(Foo) { - // int a; - // int b; - // }; - // ``` - // 2. Macros that define whole things along with the comment. - // ``` - // #define MAKE_METHOD(name) \ - // /** Comment is here, inside the macro. */ \ - // void name() {} - // - // struct S { - // MAKE_METHOD(f) - // } - // ``` - // 3. Macros that both declare a type and name a decl outside the macro. - // ``` - // /// Comment is here, where we use the macro. - // typedef NS_ENUM(NSInteger, Size) { - // SizeWidth, - // SizeHeight - // }; - // ``` - // In this case NS_ENUM declares am enum type, and uses the same name for - // the typedef declaration that appears outside the macro. The comment - // here should be applied to both declarations inside and outside the - // macro. - // - // We have found a Decl name that comes from inside a macro, but - // Decl::getLocation() returns the place where the macro is being called. - // If the declaration (and not just the name) resides inside the macro, - // then we want to map Decl::getLocation() into the macro to where the - // declaration and its attached comment (if any) were written. - // - // This mapping into the macro is done by mapping the location to its - // spelling location, however even if the declaration is inside a macro, - // the name's spelling can come from a macro argument (case 2 above). 
In - // this case mapping the location to the spelling location finds the - // argument's position (at `f` in MAKE_METHOD(`f`) above), which is not - // where the declaration and its comment are located. - // - // To avoid this issue, we make use of Decl::getBeginLocation() instead. - // While the declaration's position is where the name is written, the - // comment is always attached to the begining of the declaration, not to - // the name. - // - // In the first case, the begin location of the decl is outside the macro, - // at the location of `typedef`. This is where the comment is found as - // well. The begin location is not inside a macro, so it's spelling - // location is the same. - // - // In the second case, the begin location of the decl is the call to the - // macro, at `MAKE_METHOD`. However its spelling location is inside the - // the macro at the location of `void`. This is where the comment is found - // again. - // - // In the third case, there's no correct single behaviour. We want to use - // the comment outside the macro for the definition that's inside the macro. - // There is also a definition outside the macro, and we want the comment to - // apply to both. The cases we care about here is NS_ENUM() and - // NS_OPTIONS(). In general, if an enum is defined inside a macro, we should - // try to find the comment there. - - // This is handling case 3 for NS_ENUM() and NS_OPTIONS(), which define - // enum types inside the macro. - if (isa(D)) { - SourceLocation MacroCallLoc = SourceMgr.getExpansionLoc(DeclLoc); - if (auto BufferRef = - SourceMgr.getBufferOrNone(SourceMgr.getFileID(MacroCallLoc)); - BufferRef.has_value()) { - llvm::StringRef buffer = BufferRef->getBuffer().substr( - SourceMgr.getFileOffset(MacroCallLoc)); - if (buffer.starts_with("NS_ENUM(") || - buffer.starts_with("NS_OPTIONS(")) { - // We want to use the comment on the call to NS_ENUM and NS_OPTIONS - // macros for the types defined inside the macros, which is at the - // expansion location. - return MacroCallLoc; - } - } + if (!D->getLocation().isMacroID()) { + Locations.emplace_back(BaseLocation); + } else { + const auto *DeclCtx = D->getDeclContext(); + + // When encountering definitions generated from a macro (that are not + // contained by another declaration in the macro) we need to try and find + // the comment at the location of the expansion but if there is no comment + // there we should retry to see if there is a comment inside the macro as + // well. To this end we return first BaseLocation to first look at the + // expansion site, the second value is the spelling location of the + // beginning of the declaration defined inside the macro. + if (!(DeclCtx && + Decl::castFromDeclContext(DeclCtx)->getLocation().isMacroID())) { + Locations.emplace_back(SourceMgr.getExpansionLoc(BaseLocation)); } - return SourceMgr.getSpellingLoc(D->getBeginLoc()); + + // We use Decl::getBeginLoc() and not just BaseLocation here to ensure that + // we don't refer to the macro argument location at the expansion site (this + // can happen if the name's spelling is provided via macro argument), and + // always to the declaration itself. 
+ Locations.emplace_back(SourceMgr.getSpellingLoc(D->getBeginLoc())); } - return DeclLoc; + return Locations; } RawComment *ASTContext::getRawCommentForDeclNoCacheImpl( @@ -357,30 +290,36 @@ RawComment *ASTContext::getRawCommentForDeclNoCacheImpl( } RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const { - const SourceLocation DeclLoc = getDeclLocForCommentSearch(D, SourceMgr); + const auto DeclLocs = getDeclLocsForCommentSearch(D, SourceMgr); - // If the declaration doesn't map directly to a location in a file, we - // can't find the comment. - if (DeclLoc.isInvalid() || !DeclLoc.isFileID()) - return nullptr; + for (const auto DeclLoc : DeclLocs) { + // If the declaration doesn't map directly to a location in a file, we + // can't find the comment. + if (DeclLoc.isInvalid() || !DeclLoc.isFileID()) + continue; - if (ExternalSource && !CommentsLoaded) { - ExternalSource->ReadComments(); - CommentsLoaded = true; - } + if (ExternalSource && !CommentsLoaded) { + ExternalSource->ReadComments(); + CommentsLoaded = true; + } - if (Comments.empty()) - return nullptr; + if (Comments.empty()) + continue; - const FileID File = SourceMgr.getDecomposedLoc(DeclLoc).first; - if (!File.isValid()) { - return nullptr; + const FileID File = SourceMgr.getDecomposedLoc(DeclLoc).first; + if (!File.isValid()) + continue; + + const auto CommentsInThisFile = Comments.getCommentsInFile(File); + if (!CommentsInThisFile || CommentsInThisFile->empty()) + continue; + + if (RawComment *Comment = + getRawCommentForDeclNoCacheImpl(D, DeclLoc, *CommentsInThisFile)) + return Comment; } - const auto CommentsInThisFile = Comments.getCommentsInFile(File); - if (!CommentsInThisFile || CommentsInThisFile->empty()) - return nullptr; - return getRawCommentForDeclNoCacheImpl(D, DeclLoc, *CommentsInThisFile); + return nullptr; } void ASTContext::addComment(const RawComment &RC) { @@ -584,7 +523,6 @@ void ASTContext::attachCommentsToJustParsedDecls(ArrayRef Decls, // declaration, but also comments that *follow* the declaration -- thanks to // the lookahead in the lexer: we've consumed the semicolon and looked // ahead through comments. 
- for (const Decl *D : Decls) { assert(D); if (D->isInvalidDecl()) @@ -592,19 +530,22 @@ void ASTContext::attachCommentsToJustParsedDecls(ArrayRef Decls, D = &adjustDeclToTemplate(*D); - const SourceLocation DeclLoc = getDeclLocForCommentSearch(D, SourceMgr); - - if (DeclLoc.isInvalid() || !DeclLoc.isFileID()) - continue; - if (DeclRawComments.count(D) > 0) continue; - if (RawComment *const DocComment = - getRawCommentForDeclNoCacheImpl(D, DeclLoc, *CommentsInThisFile)) { - cacheRawCommentForDecl(*D, *DocComment); - comments::FullComment *FC = DocComment->parse(*this, PP, D); - ParsedComments[D->getCanonicalDecl()] = FC; + const auto DeclLocs = getDeclLocsForCommentSearch(D, SourceMgr); + + for (const auto DeclLoc : DeclLocs) { + if (DeclLoc.isInvalid() || !DeclLoc.isFileID()) + continue; + + if (RawComment *const DocComment = getRawCommentForDeclNoCacheImpl( + D, DeclLoc, *CommentsInThisFile)) { + cacheRawCommentForDecl(*D, *DocComment); + comments::FullComment *FC = DocComment->parse(*this, PP, D); + ParsedComments[D->getCanonicalDecl()] = FC; + break; + } } } } diff --git a/clang/test/Index/annotate-comments-objc.m b/clang/test/Index/annotate-comments-objc.m index 6a48d9ae8f2cb..f013684c1a638 100644 --- a/clang/test/Index/annotate-comments-objc.m +++ b/clang/test/Index/annotate-comments-objc.m @@ -46,18 +46,23 @@ - (void)method1_isdoxy4; /*!< method1_isdoxy4 IS_DOXYGEN_SINGLE */ // attach unrelated comments in the following cases where tag decls are // embedded in declarators. -#define DECLARE_FUNCTIONS(suffix) \ - /** functionFromMacro IS_DOXYGEN_SINGLE */ \ - void functionFromMacro(void) { \ - typedef struct Struct_notdoxy Struct_notdoxy; \ - } \ - /** functionFromMacroWithSuffix IS_DOXYGEN_SINGLE */ \ - void functionFromMacro##suffix(void) { \ - typedef struct Struct_notdoxy Struct_notdoxy; \ - } - -/// IS_DOXYGEN_NOT_ATTACHED -DECLARE_FUNCTIONS(WithSuffix) +#define DECLARE_FUNCTIONS_COMMENTS_IN_MACRO(suffix) \ + /** functionFromMacro IS_DOXYGEN_SINGLE */ \ + void functionFromMacro(void) { \ + typedef struct Struct_notdoxy Struct_notdoxy; \ + } \ + /** functionFromMacroWithSuffix IS_DOXYGEN_SINGLE */ \ + void functionFromMacro##suffix(void) { \ + typedef struct Struct_notdoxy Struct_notdoxy; \ + } + +DECLARE_FUNCTIONS_COMMENTS_IN_MACRO(WithSuffix) + +#define DECLARE_FUNCTIONS \ + void functionFromMacroWithCommentFromExpansionSite(void) { typedef struct Struct_notdoxy Struct_notdoxy; } + +/// functionFromMacroWithCommentFromExpansionSite IS_DOXYGEN_SINGLE +DECLARE_FUNCTIONS /// typedef_isdoxy1 IS_DOXYGEN_SINGLE typedef struct Struct_notdoxy *typedef_isdoxy1; @@ -68,9 +73,14 @@ void functionFromMacro(void) { \ /** namedEnumFromMacro IS_DOXYGEN_SINGLE */ \ enum name { B }; -/// IS_DOXYGEN_NOT_ATTACHED DECLARE_ENUMS(namedEnumFromMacro) +#define MYENUM(name) enum name +struct Foo { + /// Vehicles IS_DOXYGEN_SINGLE + MYENUM(Vehicles) { Car, Motorbike, Boat} a; +}; + #endif // RUN: rm -rf %t @@ -133,8 +143,10 @@ void functionFromMacro(void) { \ // CHECK: annotate-comments-objc.m:41:22: EnumDecl=An_NS_ENUM_isdoxy1:{{.*}} An_NS_ENUM_isdoxy1 IS_DOXYGEN_SINGLE // CHECK: annotate-comments-objc.m:41:22: TypedefDecl=An_NS_ENUM_isdoxy1:{{.*}} An_NS_ENUM_isdoxy1 IS_DOXYGEN_SINGLE // CHECK: annotate-comments-objc.m:41:22: EnumDecl=An_NS_ENUM_isdoxy1:{{.*}} An_NS_ENUM_isdoxy1 IS_DOXYGEN_SINGLE -// CHECK: annotate-comments-objc.m:60:1: FunctionDecl=functionFromMacro:{{.*}} BriefComment=[functionFromMacro IS_DOXYGEN_SINGLE] -// CHECK: annotate-comments-objc.m:60:1: 
FunctionDecl=functionFromMacroWithSuffix:{{.*}} BriefComment=[functionFromMacroWithSuffix IS_DOXYGEN_SINGLE] -// CHECK: annotate-comments-objc.m:63:32: TypedefDecl=typedef_isdoxy1:{{.*}} typedef_isdoxy1 IS_DOXYGEN_SINGLE -// CHECK: annotate-comments-objc.m:72:1: EnumDecl=enumFromMacro:{{.*}} BriefComment=[enumFromMacro IS_DOXYGEN_SINGLE] -// CHECK: annotate-comments-objc.m:72:15: EnumDecl=namedEnumFromMacro:{{.*}} BriefComment=[namedEnumFromMacro IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:59:1: FunctionDecl=functionFromMacro:{{.*}} BriefComment=[functionFromMacro IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:59:1: FunctionDecl=functionFromMacroWithSuffix:{{.*}} BriefComment=[functionFromMacroWithSuffix IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:65:1: FunctionDecl=functionFromMacroWithCommentFromExpansionSite:{{.*}} BriefComment=[functionFromMacroWithCommentFromExpansionSite IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:68:32: TypedefDecl=typedef_isdoxy1:{{.*}} typedef_isdoxy1 IS_DOXYGEN_SINGLE +// CHECK: annotate-comments-objc.m:76:1: EnumDecl=enumFromMacro:{{.*}} BriefComment=[enumFromMacro IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:76:15: EnumDecl=namedEnumFromMacro:{{.*}} BriefComment=[namedEnumFromMacro IS_DOXYGEN_SINGLE] +// CHECK: annotate-comments-objc.m:81:10: EnumDecl=Vehicles:{{.*}} Vehicles IS_DOXYGEN_SINGLE diff --git a/clang/unittests/Tooling/SourceCodeTest.cpp b/clang/unittests/Tooling/SourceCodeTest.cpp index 3d1dbceb63a7f..3641d2ee453f4 100644 --- a/clang/unittests/Tooling/SourceCodeTest.cpp +++ b/clang/unittests/Tooling/SourceCodeTest.cpp @@ -372,13 +372,11 @@ TEST(SourceCodeTest, getAssociatedRangeWithComments) { #define DECL /* Comment */ int x $r[[DECL;]])cpp"); - // Does not include comments when only the decl or the comment come from a - // macro. - // FIXME: Change code to allow this. Visit(R"cpp( #define DECL int x - // Comment - $r[[DECL;]])cpp"); + $r[[// Comment + DECL;]])cpp"); + // Does not include comments when only the comment come from a macro. Visit(R"cpp( #define COMMENT /* Comment */ COMMENT From 3e79f4d2c24faaa24a4a12041b4c33d39fec2ec7 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Thu, 26 Oct 2023 22:23:17 +0530 Subject: [PATCH 098/877] [MLIR][Arith] Fix arith::AtomicRMWKind::maximumf's identity value (#70312) -- In order to compute maximum, we should always initialise the result with the largest negative value possible for the concerned element type, instead of the smallest. -- This commit essentially adds a fix to this issue. Signed-off-by: Abhishek Varma --- mlir/lib/Dialect/Arith/IR/ArithOps.cpp | 2 +- mlir/test/Dialect/Linalg/transform-op-decompose.mlir | 2 +- mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp index 1002719f0b89f..56d5e0fed7618 100644 --- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp +++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp @@ -2412,7 +2412,7 @@ TypedAttr mlir::arith::getIdentityValueAttr(AtomicRMWKind kind, Type resultType, const llvm::fltSemantics &semantic = llvm::cast(resultType).getFloatSemantics(); APFloat identity = useOnlyFiniteValue - ? APFloat::getSmallest(semantic, /*Negative=*/true) + ? 
APFloat::getLargest(semantic, /*Negative=*/true) : APFloat::getInf(semantic, /*Negative=*/true); return builder.getFloatAttr(resultType, identity); } diff --git a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir index f057a70d13964..ef0aca2cc366f 100644 --- a/mlir/test/Dialect/Linalg/transform-op-decompose.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-decompose.mlir @@ -210,7 +210,7 @@ func.func @softmax(%arg0: tensor<2x16x32xf32>, %dst: tensor<2x16x32xf32>) -> ten // CHECK-LABEL: func.func @softmax( // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>, %[[DST:[a-zA-Z0-9_]+]]: tensor<2x16x32xf32>) -> tensor<2x16x32xf32> { // CHECK-DAG: %[[D1:.+]] = tensor.empty() : tensor<2x16xf32> -// CHECK-DAG: %[[CST:.+]] = arith.constant -1.401300e-45 : f32 +// CHECK-DAG: %[[CST:.+]] = arith.constant -3.40282347E+38 : f32 // CHECK: %[[D2:.+]] = linalg.fill ins(%[[CST]] : f32) outs(%[[D1]] : tensor<2x16xf32>) -> tensor<2x16xf32> // CHECK: %[[D3:.+]] = linalg.generic {indexing_maps = [#[[$MAP]], #[[$MAP1]]], iterator_types = ["parallel", // CHECK-SAME: "parallel", "reduction"]} ins(%[[ARG0]] : tensor<2x16x32xf32>) outs(%[[D2]] : tensor<2x16xf32>) { diff --git a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir index af08354b1ee4a..006d6105677e9 100644 --- a/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-split-reduction.mlir @@ -176,7 +176,7 @@ func.func @generic_split_3d_ninf(%input: tensor<32x2xf32>, %input_2: tensor<5x32 // CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-DAG: #[[$MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0, d1)> // CHECK-LABEL: func @generic_split_3d_ninf -// CHECK-DAG: %[[ID:.*]] = arith.constant -1.401300e-45 : f32 +// CHECK-DAG: %[[ID:.*]] = arith.constant -3.40282347E+38 : f32 // CHECK-DAG: %[[I1:.*]] = tensor.expand_shape %{{.*}}[0, 1], [2]] : tensor<32x2xf32> into tensor<4x8x2xf32> // CHECK-DAG: %[[I2:.*]] = tensor.expand_shape %{{.*}}[0], [1, 2]] : tensor<5x32xf32> into tensor<5x4x8xf32> // CHECK-DAG: %[[INI:.*]] = tensor.empty() : tensor<5x2x4xf32> From 178a1fea57b542c39de79624662085f86f3e348f Mon Sep 17 00:00:00 2001 From: Ilya Tocar Date: Thu, 26 Oct 2023 13:09:20 -0400 Subject: [PATCH 099/877] [libc++] Optimize string operator[] for known large inputs (#69500) If we know that index is larger than SSO size, we know that we can't be in SSO case, and should access the pointer. This removes extra check from operator[] for inputs known at compile time to be larger than SSO. 
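As a standalone sketch of the branch elimination (the `ToyString` layout and
`kSSOCap` value below are invented for illustration and are not libc++'s
actual representation; `__builtin_constant_p` is the same compiler builtin
the patch relies on):

```
#include <cstddef>

// Toy model of a small-string-optimized string; layout is illustrative only.
struct ToyString {
  static constexpr std::size_t kSSOCap = 22; // assumed inline capacity
  bool is_long_;          // discriminates inline vs. heap storage
  char sso_[kSSOCap + 1]; // inline (SSO) buffer
  char* long_ptr_;        // heap buffer, valid when is_long_ is true

  const char& operator[](std::size_t pos) const {
    // Precondition, as for std::string: pos <= size(). When pos is a
    // compile-time constant too large for the inline buffer, the string
    // must be in long mode, so the is_long_ test can fold away.
    if (__builtin_constant_p(pos) && pos > kSSOCap)
      return long_ptr_[pos];
    return is_long_ ? long_ptr_[pos] : sso_[pos];
  }
};
```

For a constant index such as `s[100]`, the compiler can then emit a single
load through the heap pointer instead of first testing the SSO discriminant.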
--- libcxx/include/string | 6 ++++++ .../basic.string/string.access/index.pass.cpp | 21 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/libcxx/include/string b/libcxx/include/string index 91935162f0238..cf9f0c847eb43 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1198,11 +1198,17 @@ public: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 const_reference operator[](size_type __pos) const _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); + if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { + return *(__get_long_pointer() + __pos); + } return *(data() + __pos); } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 reference operator[](size_type __pos) _NOEXCEPT { _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__pos <= size(), "string index out of bounds"); + if (__builtin_constant_p(__pos) && !__fits_in_sso(__pos)) { + return *(__get_long_pointer() + __pos); + } return *(__get_pointer() + __pos); } diff --git a/libcxx/test/std/strings/basic.string/string.access/index.pass.cpp b/libcxx/test/std/strings/basic.string/string.access/index.pass.cpp index a270dd579667b..8ba8bf0c8b096 100644 --- a/libcxx/test/std/strings/basic.string/string.access/index.pass.cpp +++ b/libcxx/test/std/strings/basic.string/string.access/index.pass.cpp @@ -34,10 +34,31 @@ TEST_CONSTEXPR_CXX20 void test_string() { assert(s2[0] == '\0'); } +// Same, but for the string that doesn't fit into SSO. +template +TEST_CONSTEXPR_CXX20 void test_string_long() { + S s("0123456789012345678901234567890123456789"); + const S& cs = s; + ASSERT_SAME_TYPE(decltype(s[0]), typename S::reference); + ASSERT_SAME_TYPE(decltype(cs[0]), typename S::const_reference); + LIBCPP_ASSERT_NOEXCEPT(s[0]); + LIBCPP_ASSERT_NOEXCEPT(cs[0]); + for (typename S::size_type i = 0; i < cs.size(); ++i) { + assert(s[i] == static_cast('0' + (i % 10))); + assert(cs[i] == s[i]); + } + assert(s[33] == static_cast('0' + (33 % 10))); + assert(cs[34] == s[34]); + assert(cs[cs.size()] == '\0'); + const S s2 = S(); + assert(s2[0] == '\0'); +} + TEST_CONSTEXPR_CXX20 bool test() { test_string(); #if TEST_STD_VER >= 11 test_string, min_allocator>>(); + test_string_long, min_allocator>>(); #endif return true; From 59750027b91000a56735d4f533225a3aeda996ce Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 26 Oct 2023 13:10:44 -0400 Subject: [PATCH 100/877] [libc++][NFC] Remove unused typedefs in filesystem::path helpers (#70331) I came across those typedefs while working on another change, and I noticed they were just never used. 
--- libcxx/include/__filesystem/path.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/libcxx/include/__filesystem/path.h b/libcxx/include/__filesystem/path.h index c104b003573dc..8a9350be2a00f 100644 --- a/libcxx/include/__filesystem/path.h +++ b/libcxx/include/__filesystem/path.h @@ -107,7 +107,6 @@ struct __is_pathable_string< _Void::__char_type> > : public __can_convert_char<_ECharT> { using _Str = basic_string<_ECharT, _Traits, _Alloc>; - using _Base = __can_convert_char<_ECharT>; _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); } @@ -129,7 +128,6 @@ struct __is_pathable_string< _Void::__char_type> > : public __can_convert_char<_ECharT> { using _Str = basic_string_view<_ECharT, _Traits>; - using _Base = __can_convert_char<_ECharT>; _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(_Str const& __s) { return __s.data(); } @@ -155,8 +153,6 @@ struct __is_pathable_char_array : false_type {}; template struct __is_pathable_char_array<_Source, _ECharT*, _UPtr, true> : __can_convert_char<__remove_const_t<_ECharT> > { - using _Base = __can_convert_char<__remove_const_t<_ECharT> >; - _LIBCPP_HIDE_FROM_ABI static _ECharT const* __range_begin(const _ECharT* __b) { return __b; } @@ -185,7 +181,6 @@ struct __is_pathable_iter< typename iterator_traits<_Iter>::value_type>::__char_type> > : __can_convert_char::value_type> { using _ECharT = typename iterator_traits<_Iter>::value_type; - using _Base = __can_convert_char<_ECharT>; _LIBCPP_HIDE_FROM_ABI static _Iter __range_begin(_Iter __b) { return __b; } From bf92eba697453a358412af806df5b8df29a232e0 Mon Sep 17 00:00:00 2001 From: Matt Harding Date: Thu, 26 Oct 2023 18:23:32 +0100 Subject: [PATCH 101/877] Fix comment in wasm unreachable test (#70340) Some textual editing errors got through this pull request that was merged a few weeks ago: https://github.com/llvm/llvm-project/pull/65876 This patch clears up the unintentional duplicated line, and white-space at the end of the lines. --- llvm/test/CodeGen/WebAssembly/unreachable.ll | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/WebAssembly/unreachable.ll b/llvm/test/CodeGen/WebAssembly/unreachable.ll index 72f865842bdca..5368c2ba5b8dc 100644 --- a/llvm/test/CodeGen/WebAssembly/unreachable.ll +++ b/llvm/test/CodeGen/WebAssembly/unreachable.ll @@ -80,11 +80,10 @@ define i32 @missing_ret_unreachable() { unreachable } -; This is similar to the above test, but ensures wasm unreachable is emitted -; This is similar to the above test, but the callee has a 'noreturn' attribute. -; There is an optimization that removes an 'unreachable' after a noreturn call, -; but Wasm backend doesn't use it and ignore `--no-trap-after-noreturn`, if -; given, to generate valid code. +; This is similar to the above test, but the callee has a 'noreturn' attribute. +; There is an optimization that removes an 'unreachable' after a noreturn call, +; but Wasm backend doesn't use it and ignore `--no-trap-after-noreturn`, if +; given, to generate valid code. define i32 @missing_ret_noreturn_unreachable() { ; CHECK-LABEL: missing_ret_noreturn_unreachable: ; CHECK: .functype missing_ret_noreturn_unreachable () -> (i32) From 93659947d2ee76348e300b83f035e054909d56b0 Mon Sep 17 00:00:00 2001 From: Amara Emerson Date: Thu, 26 Oct 2023 10:29:12 -0700 Subject: [PATCH 102/877] [AArch64][GlobalISel] Add support for pre-indexed loads/stores. 
(#70185) The pre-index matcher just needs some small heuristics to make sure it doesn't cause regressions. Apart from that it's a simple change, since the only difference is an immediate operand of '1' vs '0' in the instruction. --- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 23 +- .../Target/AArch64/AArch64ISelLowering.cpp | 4 - .../GISel/AArch64InstructionSelector.cpp | 83 ++-- .../legalize-indexed-load-stores.mir | 22 + .../AArch64/GlobalISel/store-merging.ll | 4 +- .../CodeGen/AArch64/arm64-indexed-memory.ll | 303 +++---------- .../AArch64/arm64-indexed-vector-ldst.ll | 428 ++++++------------ 7 files changed, 287 insertions(+), 580 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 1cccddfd97222..3c2b5f490ccb8 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1141,11 +1141,28 @@ bool CombinerHelper::findPreIndexCandidate(GLoadStore &LdSt, Register &Addr, return false; } + // Avoid increasing cross-block register pressure. + for (auto &AddrUse : MRI.use_nodbg_instructions(Addr)) + if (AddrUse.getParent() != LdSt.getParent()) + return false; + // FIXME: check whether all uses of the base pointer are constant PtrAdds. // That might allow us to end base's liveness here by adjusting the constant. - - return all_of(MRI.use_nodbg_instructions(Addr), - [&](MachineInstr &UseMI) { return dominates(LdSt, UseMI); }); + bool RealUse = false; + for (auto &AddrUse : MRI.use_nodbg_instructions(Addr)) { + if (!dominates(LdSt, AddrUse)) + return false; // All use must be dominated by the load/store. + + // If Ptr may be folded in addressing mode of other use, then it's + // not profitable to do this transformation. + if (auto *UseLdSt = dyn_cast(&AddrUse)) { + if (!canFoldInAddressingMode(UseLdSt, TLI, MRI)) + RealUse = true; + } else { + RealUse = true; + } + } + return RealUse; } bool CombinerHelper::matchCombineIndexedLoadStore( diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 038c23b5e8d50..5acc2ce58e6af 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -23718,10 +23718,6 @@ bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { bool AArch64TargetLowering::isIndexingLegal(MachineInstr &MI, Register Base, Register Offset, bool IsPre, MachineRegisterInfo &MRI) const { - // HACK - if (IsPre) - return false; // Until we implement. 
- auto CstOffset = getIConstantVRegVal(Offset, MRI); if (!CstOffset || CstOffset->isZero()) return false; diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp index 941607dae29bb..9c5b34166ffaf 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp @@ -5659,24 +5659,34 @@ bool AArch64InstructionSelector::selectIndexedLoad(MachineInstr &MI, Register WriteBack = Ld.getWritebackReg(); Register Base = Ld.getBaseReg(); Register Offset = Ld.getOffsetReg(); - - if (Ld.isPre()) - return false; // TODO: add pre-inc support - - unsigned Opc = 0; - static constexpr unsigned GPROpcodes[] = { - AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost, - AArch64::LDRXpost}; - static constexpr unsigned FPROpcodes[] = { - AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost, - AArch64::LDRDpost, AArch64::LDRQpost}; - + LLT Ty = MRI.getType(Dst); + assert(Ty.getSizeInBits() <= 128 && "Unexpected type for indexed load"); unsigned MemSize = Ld.getMMO().getMemoryType().getSizeInBytes(); - if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) - Opc = FPROpcodes[Log2_32(MemSize)]; - else - Opc = GPROpcodes[Log2_32(MemSize)]; + unsigned Opc = 0; + if (Ld.isPre()) { + static constexpr unsigned GPROpcodes[] = { + AArch64::LDRBBpre, AArch64::LDRHHpre, AArch64::LDRWpre, + AArch64::LDRXpre}; + static constexpr unsigned FPROpcodes[] = { + AArch64::LDRBpre, AArch64::LDRHpre, AArch64::LDRSpre, AArch64::LDRDpre, + AArch64::LDRQpre}; + if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) + Opc = FPROpcodes[Log2_32(MemSize)]; + else + Opc = GPROpcodes[Log2_32(MemSize)]; + } else { + static constexpr unsigned GPROpcodes[] = { + AArch64::LDRBBpost, AArch64::LDRHHpost, AArch64::LDRWpost, + AArch64::LDRXpost}; + static constexpr unsigned FPROpcodes[] = { + AArch64::LDRBpost, AArch64::LDRHpost, AArch64::LDRSpost, + AArch64::LDRDpost, AArch64::LDRQpost}; + if (RBI.getRegBank(Dst, MRI, TRI)->getID() == AArch64::FPRRegBankID) + Opc = FPROpcodes[Log2_32(MemSize)]; + else + Opc = GPROpcodes[Log2_32(MemSize)]; + } auto Cst = getIConstantVRegVal(Offset, MRI); if (!Cst) return false; // Shouldn't happen, but just in case. 
@@ -5695,23 +5705,34 @@ bool AArch64InstructionSelector::selectIndexedStore(GIndexedStore &I, Register Base = I.getBaseReg(); Register Offset = I.getOffsetReg(); LLT ValTy = MRI.getType(Val); - - if (I.isPre()) - return false; // TODO: add pre-inc support + assert(ValTy.getSizeInBits() <= 128 && "Unexpected type for indexed store"); unsigned Opc = 0; - static constexpr unsigned GPROpcodes[] = { - AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost, - AArch64::STRXpost}; - static constexpr unsigned FPROpcodes[] = { - AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost, - AArch64::STRDpost, AArch64::STRQpost}; - - assert(ValTy.getSizeInBits() <= 128); - if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) - Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; - else - Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; + if (I.isPre()) { + static constexpr unsigned GPROpcodes[] = { + AArch64::STRBBpre, AArch64::STRHHpre, AArch64::STRWpre, + AArch64::STRXpre}; + static constexpr unsigned FPROpcodes[] = { + AArch64::STRBpre, AArch64::STRHpre, AArch64::STRSpre, AArch64::STRDpre, + AArch64::STRQpre}; + + if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) + Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; + else + Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; + } else { + static constexpr unsigned GPROpcodes[] = { + AArch64::STRBBpost, AArch64::STRHHpost, AArch64::STRWpost, + AArch64::STRXpost}; + static constexpr unsigned FPROpcodes[] = { + AArch64::STRBpost, AArch64::STRHpost, AArch64::STRSpost, + AArch64::STRDpost, AArch64::STRQpost}; + + if (RBI.getRegBank(Val, MRI, TRI)->getID() == AArch64::FPRRegBankID) + Opc = FPROpcodes[Log2_32(ValTy.getSizeInBytes())]; + else + Opc = GPROpcodes[Log2_32(ValTy.getSizeInBytes())]; + } auto Cst = getIConstantVRegVal(Offset, MRI); if (!Cst) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir index e82a0c219068f..bd0317ec6a136 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-indexed-load-stores.mir @@ -87,3 +87,25 @@ body: | $q0 = COPY %dst RET_ReallyLR implicit $x0, implicit $q0 ... +--- +name: pre_store_s64 +body: | + bb.0: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: pre_store_s64 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ptr:_(p0) = COPY $x0 + ; CHECK-NEXT: %val:_(s64) = COPY $x1 + ; CHECK-NEXT: %offset:_(s64) = G_CONSTANT i64 8 + ; CHECK-NEXT: %writeback:_(p0) = G_INDEXED_STORE %val(s64), %ptr, %offset(s64), 1 :: (store (s64)) + ; CHECK-NEXT: $x0 = COPY %writeback(p0) + ; CHECK-NEXT: RET_ReallyLR implicit $x0 + %ptr:_(p0) = COPY $x0 + %val:_(s64) = COPY $x1 + %offset:_(s64) = G_CONSTANT i64 8 + %writeback:_(p0) = G_INDEXED_STORE %val, %ptr, %offset, 1 :: (store (s64), align 8) + $x0 = COPY %writeback + RET_ReallyLR implicit $x0 +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll index 23886d8bc4a7b..07744dada4f1f 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/store-merging.ll @@ -83,8 +83,8 @@ define void @test_simple_vector(ptr %ptr) { ; CHECK-NEXT: mov w8, #5 ; =0x5 ; CHECK-NEXT: strh w9, [x0, #2] ; CHECK-NEXT: mov w9, #8 ; =0x8 -; CHECK-NEXT: strh w8, [x0, #4] -; CHECK-NEXT: strh w9, [x0, #6] +; CHECK-NEXT: strh w8, [x0, #4]! 
+; CHECK-NEXT: strh w9, [x0, #2] ; CHECK-NEXT: ret store <2 x i16> , ptr %ptr %addr2 = getelementptr <2 x i16>, ptr %ptr, i64 1 diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll index a4da489171403..dc8cbd1e43b90 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll @@ -197,88 +197,40 @@ define ptr @storef64(ptr %ptr, double %index, double %spacing) { define ptr @pref64(ptr %ptr, double %spacing) { -; CHECK64-LABEL: pref64: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str d0, [x0, #32]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pref64: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #32 -; GISEL-NEXT: str d0, [x8, #32] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pref64: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str d0, [x0, #32]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pref64: +; CHECK: ; %bb.0: +; CHECK-NEXT: str d0, [x0, #32]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds double, ptr %ptr, i64 4 store double %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pref32(ptr %ptr, float %spacing) { -; CHECK64-LABEL: pref32: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str s0, [x0, #12]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pref32: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #12 -; GISEL-NEXT: str s0, [x8, #12] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pref32: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str s0, [x0, #12]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pref32: +; CHECK: ; %bb.0: +; CHECK-NEXT: str s0, [x0, #12]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds float, ptr %ptr, i64 3 store float %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pref16(ptr %ptr, half %spacing) nounwind { -; CHECK64-LABEL: pref16: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str h0, [x0, #6]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pref16: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #6 -; GISEL-NEXT: str h0, [x8, #6] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pref16: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str h0, [x0, #6]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pref16: +; CHECK: ; %bb.0: +; CHECK-NEXT: str h0, [x0, #6]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds half, ptr %ptr, i64 3 store half %spacing, ptr %incdec.ptr, align 2 ret ptr %incdec.ptr } define ptr @pre64(ptr %ptr, i64 %spacing) { -; CHECK64-LABEL: pre64: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str x1, [x0, #16]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre64: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #16 -; GISEL-NEXT: str x1, [x8, #16] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre64: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str x1, [x0, #16]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre64: +; CHECK: ; %bb.0: +; CHECK-NEXT: str x1, [x0, #16]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i64, ptr %ptr, i64 2 store i64 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr @@ -297,44 +249,20 @@ define ptr @pre64idxpos256(ptr %ptr, i64 %spacing) { } define ptr @pre64idxneg256(ptr %ptr, i64 %spacing) { -; CHECK64-LABEL: pre64idxneg256: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str x1, [x0, #-256]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre64idxneg256: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: sub x0, x0, #256 -; GISEL-NEXT: stur x1, [x8, #-256] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre64idxneg256: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str x1, [x0, #-256]! 
-; CHECK32-NEXT: ret +; CHECK-LABEL: pre64idxneg256: +; CHECK: ; %bb.0: +; CHECK-NEXT: str x1, [x0, #-256]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i64, ptr %ptr, i64 -32 store i64 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pre32(ptr %ptr, i32 %spacing) { -; CHECK64-LABEL: pre32: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str w1, [x0, #8]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre32: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #8 -; GISEL-NEXT: str w1, [x8, #8] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre32: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str w1, [x0, #8]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre32: +; CHECK: ; %bb.0: +; CHECK-NEXT: str w1, [x0, #8]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 2 store i32 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr @@ -353,44 +281,20 @@ define ptr @pre32idxpos256(ptr %ptr, i32 %spacing) { } define ptr @pre32idxneg256(ptr %ptr, i32 %spacing) { -; CHECK64-LABEL: pre32idxneg256: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str w1, [x0, #-256]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre32idxneg256: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: sub x0, x0, #256 -; GISEL-NEXT: stur w1, [x8, #-256] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre32idxneg256: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str w1, [x0, #-256]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre32idxneg256: +; CHECK: ; %bb.0: +; CHECK-NEXT: str w1, [x0, #-256]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 -64 store i32 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pre16(ptr %ptr, i16 %spacing) { -; CHECK64-LABEL: pre16: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strh w1, [x0, #4]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre16: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #4 -; GISEL-NEXT: strh w1, [x8, #4] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre16: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strh w1, [x0, #4]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre16: +; CHECK: ; %bb.0: +; CHECK-NEXT: strh w1, [x0, #4]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 2 store i16 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr @@ -409,44 +313,20 @@ define ptr @pre16idxpos256(ptr %ptr, i16 %spacing) { } define ptr @pre16idxneg256(ptr %ptr, i16 %spacing) { -; CHECK64-LABEL: pre16idxneg256: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strh w1, [x0, #-256]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre16idxneg256: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: sub x0, x0, #256 -; GISEL-NEXT: sturh w1, [x8, #-256] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre16idxneg256: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strh w1, [x0, #-256]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre16idxneg256: +; CHECK: ; %bb.0: +; CHECK-NEXT: strh w1, [x0, #-256]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 -128 store i16 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pre8(ptr %ptr, i8 %spacing) { -; CHECK64-LABEL: pre8: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strb w1, [x0, #2]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre8: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #2 -; GISEL-NEXT: strb w1, [x8, #2] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre8: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strb w1, [x0, #2]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre8: +; CHECK: ; %bb.0: +; CHECK-NEXT: strb w1, [x0, #2]! 
+; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 2 store i8 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr @@ -465,44 +345,20 @@ define ptr @pre8idxpos256(ptr %ptr, i8 %spacing) { } define ptr @pre8idxneg256(ptr %ptr, i8 %spacing) { -; CHECK64-LABEL: pre8idxneg256: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strb w1, [x0, #-256]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pre8idxneg256: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: sub x0, x0, #256 -; GISEL-NEXT: sturb w1, [x8, #-256] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pre8idxneg256: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strb w1, [x0, #-256]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pre8idxneg256: +; CHECK: ; %bb.0: +; CHECK-NEXT: strb w1, [x0, #-256]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 -256 store i8 %spacing, ptr %incdec.ptr, align 4 ret ptr %incdec.ptr } define ptr @pretrunc64to32(ptr %ptr, i64 %spacing) { -; CHECK64-LABEL: pretrunc64to32: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: str w1, [x0, #8]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pretrunc64to32: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #8 -; GISEL-NEXT: str w1, [x8, #8] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pretrunc64to32: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: str w1, [x0, #8]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pretrunc64to32: +; CHECK: ; %bb.0: +; CHECK-NEXT: str w1, [x0, #8]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i32, ptr %ptr, i64 2 %trunc = trunc i64 %spacing to i32 store i32 %trunc, ptr %incdec.ptr, align 4 @@ -510,22 +366,10 @@ define ptr @pretrunc64to32(ptr %ptr, i64 %spacing) { } define ptr @pretrunc64to16(ptr %ptr, i64 %spacing) { -; CHECK64-LABEL: pretrunc64to16: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strh w1, [x0, #4]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pretrunc64to16: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #4 -; GISEL-NEXT: strh w1, [x8, #4] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pretrunc64to16: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strh w1, [x0, #4]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pretrunc64to16: +; CHECK: ; %bb.0: +; CHECK-NEXT: strh w1, [x0, #4]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i16, ptr %ptr, i64 2 %trunc = trunc i64 %spacing to i16 store i16 %trunc, ptr %incdec.ptr, align 4 @@ -533,22 +377,10 @@ define ptr @pretrunc64to16(ptr %ptr, i64 %spacing) { } define ptr @pretrunc64to8(ptr %ptr, i64 %spacing) { -; CHECK64-LABEL: pretrunc64to8: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: strb w1, [x0, #2]! -; CHECK64-NEXT: ret -; -; GISEL-LABEL: pretrunc64to8: -; GISEL: ; %bb.0: -; GISEL-NEXT: mov x8, x0 -; GISEL-NEXT: add x0, x0, #2 -; GISEL-NEXT: strb w1, [x8, #2] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: pretrunc64to8: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: strb w1, [x0, #2]! -; CHECK32-NEXT: ret +; CHECK-LABEL: pretrunc64to8: +; CHECK: ; %bb.0: +; CHECK-NEXT: strb w1, [x0, #2]! +; CHECK-NEXT: ret %incdec.ptr = getelementptr inbounds i8, ptr %ptr, i64 2 %trunc = trunc i64 %spacing to i8 store i8 %trunc, ptr %incdec.ptr, align 4 @@ -583,24 +415,11 @@ define ptr @preidxf32(ptr %src, ptr %out) { } define ptr @preidxf16(ptr %src, ptr %out) { -; CHECK64-LABEL: preidxf16: -; CHECK64: ; %bb.0: -; CHECK64-NEXT: ldr h0, [x0, #2]! 
-; CHECK64-NEXT: str h0, [x1] -; CHECK64-NEXT: ret -; -; GISEL-LABEL: preidxf16: -; GISEL: ; %bb.0: -; GISEL-NEXT: ldr h0, [x0, #2] -; GISEL-NEXT: add x0, x0, #2 -; GISEL-NEXT: str h0, [x1] -; GISEL-NEXT: ret -; -; CHECK32-LABEL: preidxf16: -; CHECK32: ; %bb.0: -; CHECK32-NEXT: ldr h0, [x0, #2]! -; CHECK32-NEXT: str h0, [x1] -; CHECK32-NEXT: ret +; CHECK-LABEL: preidxf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr h0, [x0, #2]! +; CHECK-NEXT: str h0, [x1] +; CHECK-NEXT: ret %ptr = getelementptr inbounds half, ptr %src, i64 1 %tmp = load half, ptr %ptr, align 2 store half %tmp, ptr %out, align 2 diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll index 46563f6a8e089..0d7620d1c883d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll +++ b/llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll @@ -626,20 +626,12 @@ @ptr = global ptr null define <8 x i8> @test_v8i8_pre_load(ptr %addr) { -; SDAG-LABEL: test_v8i8_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr d0, [x0, #40]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v8i8_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr d0, [x0, #40] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v8i8_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0, #40]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <8 x i8>, ptr %addr, i32 5 %val = load <8 x i8>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -660,20 +652,12 @@ define <8 x i8> @test_v8i8_post_load(ptr %addr) { } define void @test_v8i8_pre_store(<8 x i8> %in, ptr %addr) { -; SDAG-LABEL: test_v8i8_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str d0, [x0, #40]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v8i8_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str d0, [x0, #40] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v8i8_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str d0, [x0, #40]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <8 x i8>, ptr %addr, i32 5 store <8 x i8> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -694,20 +678,12 @@ define void @test_v8i8_post_store(<8 x i8> %in, ptr %addr) { } define <4 x i16> @test_v4i16_pre_load(ptr %addr) { -; SDAG-LABEL: test_v4i16_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr d0, [x0, #40]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4i16_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr d0, [x0, #40] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4i16_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0, #40]! 
+; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x i16>, ptr %addr, i32 5 %val = load <4 x i16>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -728,20 +704,12 @@ define <4 x i16> @test_v4i16_post_load(ptr %addr) { } define void @test_v4i16_pre_store(<4 x i16> %in, ptr %addr) { -; SDAG-LABEL: test_v4i16_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str d0, [x0, #40]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4i16_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str d0, [x0, #40] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4i16_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str d0, [x0, #40]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x i16>, ptr %addr, i32 5 store <4 x i16> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -762,20 +730,12 @@ define void @test_v4i16_post_store(<4 x i16> %in, ptr %addr) { } define <2 x i32> @test_v2i32_pre_load(ptr %addr) { -; SDAG-LABEL: test_v2i32_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr d0, [x0, #40]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2i32_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr d0, [x0, #40] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2i32_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0, #40]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x i32>, ptr %addr, i32 5 %val = load <2 x i32>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -796,20 +756,12 @@ define <2 x i32> @test_v2i32_post_load(ptr %addr) { } define void @test_v2i32_pre_store(<2 x i32> %in, ptr %addr) { -; SDAG-LABEL: test_v2i32_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str d0, [x0, #40]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2i32_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str d0, [x0, #40] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2i32_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str d0, [x0, #40]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x i32>, ptr %addr, i32 5 store <2 x i32> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -830,20 +782,12 @@ define void @test_v2i32_post_store(<2 x i32> %in, ptr %addr) { } define <2 x float> @test_v2f32_pre_load(ptr %addr) { -; SDAG-LABEL: test_v2f32_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr d0, [x0, #40]! 
-; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2f32_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr d0, [x0, #40] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2f32_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0, #40]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x float>, ptr %addr, i32 5 %val = load <2 x float>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -864,20 +808,12 @@ define <2 x float> @test_v2f32_post_load(ptr %addr) { } define void @test_v2f32_pre_store(<2 x float> %in, ptr %addr) { -; SDAG-LABEL: test_v2f32_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str d0, [x0, #40]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2f32_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str d0, [x0, #40] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2f32_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str d0, [x0, #40]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x float>, ptr %addr, i32 5 store <2 x float> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -907,10 +843,10 @@ define <1 x i64> @test_v1i64_pre_load(ptr %addr) { ; ; CHECK-GISEL-LABEL: test_v1i64_pre_load: ; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr d0, [x0, #40] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] +; CHECK-GISEL-NEXT: ldr x8, [x0, #40]! +; CHECK-GISEL-NEXT: adrp x9, _ptr@PAGE +; CHECK-GISEL-NEXT: str x0, [x9, _ptr@PAGEOFF] +; CHECK-GISEL-NEXT: fmov d0, x8 ; CHECK-GISEL-NEXT: ret %newaddr = getelementptr <1 x i64>, ptr %addr, i32 5 %val = load <1 x i64>, ptr %newaddr, align 8 @@ -940,20 +876,12 @@ define <1 x i64> @test_v1i64_post_load(ptr %addr) { } define void @test_v1i64_pre_store(<1 x i64> %in, ptr %addr) { -; SDAG-LABEL: test_v1i64_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str d0, [x0, #40]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v1i64_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #40 -; CHECK-GISEL-NEXT: str d0, [x0, #40] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v1i64_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str d0, [x0, #40]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <1 x i64>, ptr %addr, i32 5 store <1 x i64> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -974,20 +902,12 @@ define void @test_v1i64_post_store(<1 x i64> %in, ptr %addr) { } define <16 x i8> @test_v16i8_pre_load(ptr %addr) { -; SDAG-LABEL: test_v16i8_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! 
-; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v16i8_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v16i8_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <16 x i8>, ptr %addr, i32 5 %val = load <16 x i8>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1008,20 +928,12 @@ define <16 x i8> @test_v16i8_post_load(ptr %addr) { } define void @test_v16i8_pre_store(<16 x i8> %in, ptr %addr) { -; SDAG-LABEL: test_v16i8_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v16i8_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v16i8_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <16 x i8>, ptr %addr, i32 5 store <16 x i8> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1042,20 +954,12 @@ define void @test_v16i8_post_store(<16 x i8> %in, ptr %addr) { } define <8 x i16> @test_v8i16_pre_load(ptr %addr) { -; SDAG-LABEL: test_v8i16_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v8i16_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v8i16_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <8 x i16>, ptr %addr, i32 5 %val = load <8 x i16>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1076,20 +980,12 @@ define <8 x i16> @test_v8i16_post_load(ptr %addr) { } define void @test_v8i16_pre_store(<8 x i16> %in, ptr %addr) { -; SDAG-LABEL: test_v8i16_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v8i16_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v8i16_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! 
+; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <8 x i16>, ptr %addr, i32 5 store <8 x i16> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1110,20 +1006,12 @@ define void @test_v8i16_post_store(<8 x i16> %in, ptr %addr) { } define <4 x i32> @test_v4i32_pre_load(ptr %addr) { -; SDAG-LABEL: test_v4i32_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4i32_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4i32_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x i32>, ptr %addr, i32 5 %val = load <4 x i32>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1144,20 +1032,12 @@ define <4 x i32> @test_v4i32_post_load(ptr %addr) { } define void @test_v4i32_pre_store(<4 x i32> %in, ptr %addr) { -; SDAG-LABEL: test_v4i32_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4i32_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4i32_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x i32>, ptr %addr, i32 5 store <4 x i32> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1179,20 +1059,12 @@ define void @test_v4i32_post_store(<4 x i32> %in, ptr %addr) { define <4 x float> @test_v4f32_pre_load(ptr %addr) { -; SDAG-LABEL: test_v4f32_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4f32_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4f32_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x float>, ptr %addr, i32 5 %val = load <4 x float>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1213,20 +1085,12 @@ define <4 x float> @test_v4f32_post_load(ptr %addr) { } define void @test_v4f32_pre_store(<4 x float> %in, ptr %addr) { -; SDAG-LABEL: test_v4f32_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! 
-; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v4f32_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v4f32_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <4 x float>, ptr %addr, i32 5 store <4 x float> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1248,20 +1112,12 @@ define void @test_v4f32_post_store(<4 x float> %in, ptr %addr) { define <2 x i64> @test_v2i64_pre_load(ptr %addr) { -; SDAG-LABEL: test_v2i64_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2i64_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2i64_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x i64>, ptr %addr, i32 5 %val = load <2 x i64>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1282,20 +1138,12 @@ define <2 x i64> @test_v2i64_post_load(ptr %addr) { } define void @test_v2i64_pre_store(<2 x i64> %in, ptr %addr) { -; SDAG-LABEL: test_v2i64_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2i64_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2i64_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x i64>, ptr %addr, i32 5 store <2 x i64> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1317,20 +1165,12 @@ define void @test_v2i64_post_store(<2 x i64> %in, ptr %addr) { define <2 x double> @test_v2f64_pre_load(ptr %addr) { -; SDAG-LABEL: test_v2f64_pre_load: -; SDAG: ; %bb.0: -; SDAG-NEXT: ldr q0, [x0, #80]! -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2f64_pre_load: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: ldr q0, [x0, #80] -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2f64_pre_load: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr q0, [x0, #80]! 
+; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x double>, ptr %addr, i32 5 %val = load <2 x double>, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr @@ -1351,20 +1191,12 @@ define <2 x double> @test_v2f64_post_load(ptr %addr) { } define void @test_v2f64_pre_store(<2 x double> %in, ptr %addr) { -; SDAG-LABEL: test_v2f64_pre_store: -; SDAG: ; %bb.0: -; SDAG-NEXT: adrp x8, _ptr@PAGE -; SDAG-NEXT: str q0, [x0, #80]! -; SDAG-NEXT: str x0, [x8, _ptr@PAGEOFF] -; SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: test_v2f64_pre_store: -; CHECK-GISEL: ; %bb.0: -; CHECK-GISEL-NEXT: adrp x8, _ptr@PAGE -; CHECK-GISEL-NEXT: add x9, x0, #80 -; CHECK-GISEL-NEXT: str q0, [x0, #80] -; CHECK-GISEL-NEXT: str x9, [x8, _ptr@PAGEOFF] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: test_v2f64_pre_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: adrp x8, _ptr@PAGE +; CHECK-NEXT: str q0, [x0, #80]! +; CHECK-NEXT: str x0, [x8, _ptr@PAGEOFF] +; CHECK-NEXT: ret %newaddr = getelementptr <2 x double>, ptr %addr, i32 5 store <2 x double> %in, ptr %newaddr, align 8 store ptr %newaddr, ptr @ptr From 7fa19e6f4b87623b0ca1a23bf6b6293c1b5e5799 Mon Sep 17 00:00:00 2001 From: Nishant Patel Date: Thu, 26 Oct 2023 10:41:09 -0700 Subject: [PATCH 103/877] [MLIR] Add SyclRuntimeWrapper (#69648) --- mlir/CMakeLists.txt | 1 + mlir/cmake/modules/FindLevelZero.cmake | 221 ++++++++++++++++++ mlir/cmake/modules/FindSyclRuntime.cmake | 68 ++++++ mlir/lib/ExecutionEngine/CMakeLists.txt | 36 +++ .../ExecutionEngine/SyclRuntimeWrappers.cpp | 209 +++++++++++++++++ 5 files changed, 535 insertions(+) create mode 100644 mlir/cmake/modules/FindLevelZero.cmake create mode 100644 mlir/cmake/modules/FindSyclRuntime.cmake create mode 100644 mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index ac120aad0d1ed..16ff950089734 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -126,6 +126,7 @@ add_definitions(-DMLIR_ROCM_CONVERSIONS_ENABLED=${MLIR_ENABLE_ROCM_CONVERSIONS}) set(MLIR_ENABLE_DEPRECATED_GPU_SERIALIZATION 0 CACHE BOOL "Enable deprecated GPU serialization passes") set(MLIR_ENABLE_CUDA_RUNNER 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_ENABLE_ROCM_RUNNER 0 CACHE BOOL "Enable building the mlir ROCm runner") +set(MLIR_ENABLE_SYCL_RUNNER 0 CACHE BOOL "Enable building the mlir Sycl runner") set(MLIR_ENABLE_SPIRV_CPU_RUNNER 0 CACHE BOOL "Enable building the mlir SPIR-V cpu runner") set(MLIR_ENABLE_VULKAN_RUNNER 0 CACHE BOOL "Enable building the mlir Vulkan runner") set(MLIR_ENABLE_NVPTXCOMPILER 0 CACHE BOOL diff --git a/mlir/cmake/modules/FindLevelZero.cmake b/mlir/cmake/modules/FindLevelZero.cmake new file mode 100644 index 0000000000000..012187f0afc0b --- /dev/null +++ b/mlir/cmake/modules/FindLevelZero.cmake @@ -0,0 +1,221 @@ +# CMake find_package() module for level-zero +# +# Example usage: +# +# find_package(LevelZero) +# +# If successful, the following variables will be defined: +# LevelZero_FOUND +# LevelZero_INCLUDE_DIRS +# LevelZero_LIBRARY +# LevelZero_LIBRARIES_DIR +# +# By default, the module searches the standard paths to locate the "ze_api.h" +# and the ze_loader shared library. When using a custom level-zero installation, +# the environment variable "LEVEL_ZERO_DIR" should be specified telling the +# module to get the level-zero library and headers from that location. + +include(FindPackageHandleStandardArgs) + +# Search path priority +# 1. CMake Variable LEVEL_ZERO_DIR +# 2. 
Environment Variable LEVEL_ZERO_DIR
+
+if(NOT LEVEL_ZERO_DIR)
+    if(DEFINED ENV{LEVEL_ZERO_DIR})
+        set(LEVEL_ZERO_DIR "$ENV{LEVEL_ZERO_DIR}")
+    endif()
+endif()
+
+if(LEVEL_ZERO_DIR)
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+        PATHS ${LEVEL_ZERO_DIR}/include
+        NO_DEFAULT_PATH
+    )
+
+    if(LINUX)
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+                  ${LEVEL_ZERO_DIR}/lib/x86_64-linux-gnu
+            NO_DEFAULT_PATH
+        )
+    else()
+        find_library(LevelZero_LIBRARY
+            NAMES ze_loader
+            PATHS ${LEVEL_ZERO_DIR}/lib
+            NO_DEFAULT_PATH
+        )
+    endif()
+else()
+    find_path(LevelZero_INCLUDE_DIR
+        NAMES level_zero/ze_api.h
+    )
+
+    find_library(LevelZero_LIBRARY
+        NAMES ze_loader
+    )
+endif()
+
+# Compares two version strings that are expected to be in x.y.z format and
+# reports whether VERSION_STR1 is greater than or equal to VERSION_STR2.
+# The strings are compared lexicographically after conversion to lists of
+# equal lengths, with the shorter string getting zero-padded.
+function(compare_versions VERSION_STR1 VERSION_STR2 OUTPUT)
+    # Convert the strings to lists
+    string(REPLACE "." ";" VL1 ${VERSION_STR1})
+    string(REPLACE "." ";" VL2 ${VERSION_STR2})
+    # Get the lengths of both lists
+    list(LENGTH VL1 VL1_LEN)
+    list(LENGTH VL2 VL2_LEN)
+    set(LEN ${VL1_LEN})
+    # If they differ in size, pad the shorter list with 0s
+    if(VL1_LEN GREATER VL2_LEN)
+        math(EXPR DIFF "${VL1_LEN} - ${VL2_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL2 "0")
+        endforeach()
+    elseif(VL2_LEN GREATER VL1_LEN)
+        math(EXPR DIFF "${VL2_LEN} - ${VL1_LEN}" OUTPUT_FORMAT DECIMAL)
+        foreach(IDX RANGE 1 ${DIFF} 1)
+            list(APPEND VL1 "0")
+        endforeach()
+        set(LEN ${VL2_LEN})
+    endif()
+    math(EXPR LEN_SUB_ONE "${LEN}-1")
+    foreach(IDX RANGE 0 ${LEN_SUB_ONE} 1)
+        list(GET VL1 ${IDX} VAL1)
+        list(GET VL2 ${IDX} VAL2)
+
+        if(${VAL1} GREATER ${VAL2})
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+            break()
+        elseif(${VAL1} LESS ${VAL2})
+            set(${OUTPUT} FALSE PARENT_SCOPE)
+            break()
+        else()
+            set(${OUTPUT} TRUE PARENT_SCOPE)
+        endif()
+    endforeach()
+
+endfunction(compare_versions)
+
+# Builds and runs a small program to extract the LevelZero loader version.
+function(get_l0_loader_version)
+
+    set(L0_VERSIONEER_SRC
+        [====[
+        #include <iostream>
+        #include <level_zero/loader/ze_loader.h>
+        #include <string>
+        int main() {
+            ze_result_t result;
+            std::string loader("loader");
+            zel_component_version_t *versions;
+            size_t size = 0;
+            result = zeInit(0);
+            if (result != ZE_RESULT_SUCCESS) {
+                std::cerr << "Failed to init ze driver" << std::endl;
+                return -1;
+            }
+            zelLoaderGetVersions(&size, nullptr);
+            versions = new zel_component_version_t[size];
+            zelLoaderGetVersions(&size, versions);
+            for (size_t i = 0; i < size; i++) {
+                if (loader.compare(versions[i].component_name) == 0) {
+                    std::cout << versions[i].component_lib_version.major << "."
+                              << versions[i].component_lib_version.minor << "."
+                              << versions[i].component_lib_version.patch;
+                    break;
+                }
+            }
+            delete[] versions;
+            return 0;
+        }
+        ]====]
+    )
+
+    set(L0_VERSIONEER_FILE ${CMAKE_BINARY_DIR}/temp/l0_versioneer.cpp)
+
+    file(WRITE ${L0_VERSIONEER_FILE} "${L0_VERSIONEER_SRC}")
+
+    # We need both the directories in the include path as ze_loader.h
+    # includes "ze_api.h" and not "level_zero/ze_api.h".
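+    # For example, with LEVEL_ZERO_DIR=/opt/level-zero (an illustrative path),
+    # both /opt/level-zero/include and /opt/level-zero/include/level_zero end
+    # up on the include path of the try_run() below.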
+ list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}) + list(APPEND INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}/level_zero) + list(JOIN INCLUDE_DIRS ";" INCLUDE_DIRS_STR) + try_run(L0_VERSIONEER_RUN L0_VERSIONEER_COMPILE + "${CMAKE_BINARY_DIR}" + "${L0_VERSIONEER_FILE}" + LINK_LIBRARIES ${LevelZero_LIBRARY} + CMAKE_FLAGS + "-DINCLUDE_DIRECTORIES=${INCLUDE_DIRS_STR}" + RUN_OUTPUT_VARIABLE L0_VERSION + ) + if(${L0_VERSIONEER_COMPILE} AND (DEFINED L0_VERSIONEER_RUN)) + set(LevelZero_VERSION ${L0_VERSION} PARENT_SCOPE) + message(STATUS "Found Level Zero of version: ${L0_VERSION}") + else() + message(FATAL_ERROR + "Could not compile a level-zero program to extract loader version" + ) + endif() +endfunction(get_l0_loader_version) + +if(LevelZero_INCLUDE_DIR AND LevelZero_LIBRARY) + list(APPEND LevelZero_LIBRARIES "${LevelZero_LIBRARY}") + list(APPEND LevelZero_INCLUDE_DIRS ${LevelZero_INCLUDE_DIR}) + if(OpenCL_FOUND) + list(APPEND LevelZero_INCLUDE_DIRS ${OpenCL_INCLUDE_DIRS}) + endif() + + cmake_path(GET LevelZero_LIBRARY PARENT_PATH LevelZero_LIBRARIES_PATH) + set(LevelZero_LIBRARIES_DIR ${LevelZero_LIBRARIES_PATH}) + + if(NOT TARGET LevelZero::LevelZero) + add_library(LevelZero::LevelZero INTERFACE IMPORTED) + set_target_properties(LevelZero::LevelZero + PROPERTIES INTERFACE_LINK_LIBRARIES "${LevelZero_LIBRARIES}" + ) + set_target_properties(LevelZero::LevelZero + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${LevelZero_INCLUDE_DIRS}" + ) + endif() +endif() + +# Check if a specific version of Level Zero is required +if(LevelZero_FIND_VERSION) + get_l0_loader_version() + set(VERSION_GT_FIND_VERSION FALSE) + compare_versions( + ${LevelZero_VERSION} + ${LevelZero_FIND_VERSION} + VERSION_GT_FIND_VERSION + ) + if(${VERSION_GT_FIND_VERSION}) + set(LevelZero_FOUND TRUE) + else() + set(LevelZero_FOUND FALSE) + endif() +else() + set(LevelZero_FOUND TRUE) +endif() + +find_package_handle_standard_args(LevelZero + REQUIRED_VARS + LevelZero_FOUND + LevelZero_INCLUDE_DIRS + LevelZero_LIBRARY + LevelZero_LIBRARIES_DIR + HANDLE_COMPONENTS +) +mark_as_advanced(LevelZero_LIBRARY LevelZero_INCLUDE_DIRS) + +if(LevelZero_FOUND) + find_package_message(LevelZero "Found LevelZero: ${LevelZero_LIBRARY}" + "(found version ${LevelZero_VERSION})" + ) +else() + find_package_message(LevelZero "Could not find LevelZero" "") +endif() diff --git a/mlir/cmake/modules/FindSyclRuntime.cmake b/mlir/cmake/modules/FindSyclRuntime.cmake new file mode 100644 index 0000000000000..38b065a3f284c --- /dev/null +++ b/mlir/cmake/modules/FindSyclRuntime.cmake @@ -0,0 +1,68 @@ +# CMake find_package() module for SYCL Runtime +# +# Example usage: +# +# find_package(SyclRuntime) +# +# If successful, the following variables will be defined: +# SyclRuntime_FOUND +# SyclRuntime_INCLUDE_DIRS +# SyclRuntime_LIBRARY +# SyclRuntime_LIBRARIES_DIR +# + +include(FindPackageHandleStandardArgs) + +if(NOT DEFINED ENV{CMPLR_ROOT}) + message(WARNING "Please make sure to install Intel DPC++ Compiler and run setvars.(sh/bat)") + message(WARNING "You can download standalone Intel DPC++ Compiler from https://www.intel.com/content/www/us/en/developer/articles/tool/oneapi-standalone-components.html#compilers") +else() + if(LINUX OR (${CMAKE_SYSTEM_NAME} MATCHES "Linux")) + set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/linux") + elseif(WIN32) + set(SyclRuntime_ROOT "$ENV{CMPLR_ROOT}/windows") + endif() + list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include") + list(APPEND SyclRuntime_INCLUDE_DIRS "${SyclRuntime_ROOT}/include/sycl") + + set(SyclRuntime_LIBRARY_DIR 
"${SyclRuntime_ROOT}/lib") + + message(STATUS "SyclRuntime_LIBRARY_DIR: ${SyclRuntime_LIBRARY_DIR}") + find_library(SyclRuntime_LIBRARY + NAMES sycl + PATHS ${SyclRuntime_LIBRARY_DIR} + NO_DEFAULT_PATH + ) +endif() + +if(SyclRuntime_LIBRARY) + set(SyclRuntime_FOUND TRUE) + if(NOT TARGET SyclRuntime::SyclRuntime) + add_library(SyclRuntime::SyclRuntime INTERFACE IMPORTED) + set_target_properties(SyclRuntime::SyclRuntime + PROPERTIES INTERFACE_LINK_LIBRARIES "${SyclRuntime_LIBRARY}" + ) + set_target_properties(SyclRuntime::SyclRuntime + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${SyclRuntime_INCLUDE_DIRS}" + ) + endif() +else() + set(SyclRuntime_FOUND FALSE) +endif() + +find_package_handle_standard_args(SyclRuntime + REQUIRED_VARS + SyclRuntime_FOUND + SyclRuntime_INCLUDE_DIRS + SyclRuntime_LIBRARY + SyclRuntime_LIBRARY_DIR + HANDLE_COMPONENTS +) + +mark_as_advanced(SyclRuntime_LIBRARY SyclRuntime_INCLUDE_DIRS) + +if(SyclRuntime_FOUND) + find_package_message(SyclRuntime "Found SyclRuntime: ${SyclRuntime_LIBRARY}" "") +else() + find_package_message(SyclRuntime "Could not find SyclRuntime" "") +endif() diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index ea33c2c6ed261..fdc797763ae3a 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -12,6 +12,7 @@ set(LLVM_OPTIONAL_SOURCES RunnerUtils.cpp OptUtils.cpp JitRunner.cpp + SyclRuntimeWrappers.cpp ) # Use a separate library for OptUtils, to avoid pulling in the entire JIT and @@ -328,4 +329,39 @@ if(LLVM_ENABLE_PIC) hip::host hip::amdhip64 ) endif() + + if(MLIR_ENABLE_SYCL_RUNNER) + find_package(SyclRuntime) + + if(NOT SyclRuntime_FOUND) + message(FATAL_ERROR "syclRuntime not found. Please set check oneapi installation and run setvars.sh.") + endif() + + find_package(LevelZero) + + if(NOT LevelZero_FOUND) + message(FATAL_ERROR "LevelZero not found. Please set LEVEL_ZERO_DIR.") + endif() + + add_mlir_library(mlir_sycl_runtime + SHARED + SyclRuntimeWrappers.cpp + + EXCLUDE_FROM_LIBMLIR + ) + + check_cxx_compiler_flag("-frtti" CXX_HAS_FRTTI_FLAG) + if(NOT CXX_HAS_FRTTI_FLAG) + message(FATAL_ERROR "CXX compiler does not accept flag -frtti") + endif() + target_compile_options (mlir_sycl_runtime PUBLIC -fexceptions -frtti) + + target_include_directories(mlir_sycl_runtime PRIVATE + ${MLIR_INCLUDE_DIRS} + ) + + target_link_libraries(mlir_sycl_runtime PRIVATE LevelZero::LevelZero SyclRuntime::SyclRuntime) + + set_property(TARGET mlir_sycl_runtime APPEND PROPERTY BUILD_RPATH "${LevelZero_LIBRARIES_DIR}" "${SyclRuntime_LIBRARIES_DIR}") + endif() endif() diff --git a/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp new file mode 100644 index 0000000000000..c250340c38fc7 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp @@ -0,0 +1,209 @@ +//===- SyclRuntimeWrappers.cpp - MLIR SYCL wrapper library ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements wrappers around the sycl runtime library with C linkage
+//
+//===----------------------------------------------------------------------===//
+
+#include <cassert>
+#include <cstdio>
+#include <stdexcept>
+
+#include <level_zero/ze_api.h>
+#include <sycl/ext/oneapi/backend/level_zero.hpp>
+#include <sycl/sycl.hpp>
+
+#ifdef _WIN32
+#define SYCL_RUNTIME_EXPORT __declspec(dllexport)
+#else
+#define SYCL_RUNTIME_EXPORT
+#endif // _WIN32
+
+namespace {
+
+template <typename F>
+auto catchAll(F &&func) {
+  try {
+    return func();
+  } catch (const std::exception &e) {
+    fprintf(stdout, "An exception was thrown: %s\n", e.what());
+    fflush(stdout);
+    abort();
+  } catch (...) {
+    fprintf(stdout, "An unknown exception was thrown\n");
+    fflush(stdout);
+    abort();
+  }
+}
+
+#define L0_SAFE_CALL(call)                                                     \
+  {                                                                            \
+    ze_result_t status = (call);                                               \
+    if (status != ZE_RESULT_SUCCESS) {                                         \
+      fprintf(stdout, "L0 error %d\n", status);                                \
+      fflush(stdout);                                                          \
+      abort();                                                                 \
+    }                                                                          \
+  }
+
+} // namespace
+
+static sycl::device getDefaultDevice() {
+  static sycl::device syclDevice;
+  static bool isDeviceInitialised = false;
+  if (!isDeviceInitialised) {
+    auto platformList = sycl::platform::get_platforms();
+    for (const auto &platform : platformList) {
+      auto platformName = platform.get_info<sycl::info::platform::name>();
+      bool isLevelZero = platformName.find("Level-Zero") != std::string::npos;
+      if (!isLevelZero)
+        continue;
+
+      syclDevice = platform.get_devices()[0];
+      isDeviceInitialised = true;
+      return syclDevice;
+    }
+    throw std::runtime_error("getDefaultDevice failed");
+  } else
+    return syclDevice;
+}
+
+static sycl::context getDefaultContext() {
+  static sycl::context syclContext{getDefaultDevice()};
+  return syclContext;
+}
+
+static void *allocDeviceMemory(sycl::queue *queue, size_t size, bool isShared) {
+  void *memPtr = nullptr;
+  if (isShared) {
+    memPtr = sycl::aligned_alloc_shared(64, size, getDefaultDevice(),
+                                        getDefaultContext());
+  } else {
+    memPtr = sycl::aligned_alloc_device(64, size, getDefaultDevice(),
+                                        getDefaultContext());
+  }
+  if (memPtr == nullptr) {
+    throw std::runtime_error("mem allocation failed!");
+  }
+  return memPtr;
+}
+
+static void deallocDeviceMemory(sycl::queue *queue, void *ptr) {
+  sycl::free(ptr, *queue);
+}
+
+static ze_module_handle_t loadModule(const void *data, size_t dataSize) {
+  assert(data);
+  ze_module_handle_t zeModule;
+  ze_module_desc_t desc = {ZE_STRUCTURE_TYPE_MODULE_DESC,
+                           nullptr,
+                           ZE_MODULE_FORMAT_IL_SPIRV,
+                           dataSize,
+                           (const uint8_t *)data,
+                           nullptr,
+                           nullptr};
+  auto zeDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+      getDefaultDevice());
+  auto zeContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
+      getDefaultContext());
+  L0_SAFE_CALL(zeModuleCreate(zeContext, zeDevice, &desc, &zeModule, nullptr));
+  return zeModule;
+}
+
+static sycl::kernel *getKernel(ze_module_handle_t zeModule, const char *name) {
+  assert(zeModule);
+  assert(name);
+  ze_kernel_handle_t zeKernel;
+  ze_kernel_desc_t desc = {};
+  desc.pKernelName = name;
+
+  L0_SAFE_CALL(zeKernelCreate(zeModule, &desc, &zeKernel));
+  sycl::kernel_bundle<sycl::bundle_state::executable> kernelBundle =
+      sycl::make_kernel_bundle<sycl::backend::ext_oneapi_level_zero,
+                               sycl::bundle_state::executable>(
+          {zeModule}, getDefaultContext());
+
+  auto kernel = sycl::make_kernel<sycl::backend::ext_oneapi_level_zero>(
+      {kernelBundle, zeKernel}, getDefaultContext());
+  return new sycl::kernel(kernel);
+}
+
+static void launchKernel(sycl::queue *queue, sycl::kernel *kernel, size_t gridX,
+                         size_t gridY, size_t gridZ, size_t blockX,
+                         size_t blockY, size_t blockZ, size_t sharedMemBytes,
+                         void **params, size_t paramsCount) {
+  auto syclGlobalRange =
+      sycl::range<3>(blockZ * gridZ, blockY * gridY, blockX * gridX);
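+  // Note: SYCL's nd_range takes the *global* size (total work-items per
+  // dimension) rather than a block count, hence the multiplication by the
+  // block dimensions above; the fastest-varying dimension (X) goes last,
+  // per SYCL convention.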
+  auto syclLocalRange = sycl::range<3>(blockZ, blockY, blockX);
+  sycl::nd_range<3> syclNdRange(syclGlobalRange, syclLocalRange);
+
+  queue->submit([&](sycl::handler &cgh) {
+    for (size_t i = 0; i < paramsCount; i++) {
+      cgh.set_arg(static_cast<uint32_t>(i),
+                  *(static_cast<void **>(params[i])));
+    }
+    cgh.parallel_for(syclNdRange, *kernel);
+  });
+}
+
+// Wrappers
+
+extern "C" SYCL_RUNTIME_EXPORT sycl::queue *mgpuStreamCreate() {
+
+  return catchAll([&]() {
+    sycl::queue *queue =
+        new sycl::queue(getDefaultContext(), getDefaultDevice());
+    return queue;
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamDestroy(sycl::queue *queue) {
+  catchAll([&]() { delete queue; });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void *
+mgpuMemAlloc(uint64_t size, sycl::queue *queue, bool isShared) {
+  return catchAll([&]() {
+    return allocDeviceMemory(queue, static_cast<size_t>(size), isShared);
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuMemFree(void *ptr, sycl::queue *queue) {
+  catchAll([&]() {
+    if (ptr) {
+      deallocDeviceMemory(queue, ptr);
+    }
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT ze_module_handle_t
+mgpuModuleLoad(const void *data, size_t gpuBlobSize) {
+  return catchAll([&]() { return loadModule(data, gpuBlobSize); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT sycl::kernel *
+mgpuModuleGetFunction(ze_module_handle_t module, const char *name) {
+  return catchAll([&]() { return getKernel(module, name); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuLaunchKernel(sycl::kernel *kernel, size_t gridX, size_t gridY, size_t gridZ,
+                 size_t blockX, size_t blockY, size_t blockZ,
+                 size_t sharedMemBytes, sycl::queue *queue, void **params,
+                 void ** /*extra*/, size_t paramsCount) {
+  return catchAll([&]() {
+    launchKernel(queue, kernel, gridX, gridY, gridZ, blockX, blockY, blockZ,
+                 sharedMemBytes, params, paramsCount);
+  });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void mgpuStreamSynchronize(sycl::queue *queue) {
+
+  catchAll([&]() { queue->wait(); });
+}
+
+extern "C" SYCL_RUNTIME_EXPORT void
+mgpuModuleUnload(ze_module_handle_t module) {
+
+  catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); });
+}
From 88d00a6897d71fded96a4f806ce5ebc46fd2a0de Mon Sep 17 00:00:00 2001
From: Alpha Abdoulaye
Date: Thu, 26 Oct 2023 10:45:08 -0700
Subject: [PATCH 104/877] Reland [dsymutil] Add support for mergeable libraries (#70256)

Reland https://reviews.llvm.org/D158124
Fixed `-fpermissive` error reported by gcc only.
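For context, the DWARF linker drives the new AddressesMap hooks as follows:
while cloning each attribute it checks needToSaveValidRelocs(), saves the
attribute's relocations rebased into the linked file through
updateAndSaveValidRelocs(), and re-keys them with
updateRelocationsWithUnitOffset() once the unit's output offset is final.
A minimal sketch of a conforming implementation (all names below are
hypothetical and simplified; the real one lives in
llvm/tools/dsymutil/DwarfLinkerForBinary):

  #include <cstdint>
  #include <optional>
  #include <string>
  #include <vector>

  struct SavedReloc {
    uint64_t Offset;  // relocation offset rebased into the linked debug_info
    uint64_t UnitKey; // compile-unit offset the relocation is keyed on
  };

  class ToyAddressesMap {
    std::vector<SavedReloc> Saved;
    std::optional<std::string> InstallName{"libfoo.dylib"}; // hypothetical
  public:
    // Only binaries with an install name need their relocations preserved.
    std::optional<std::string> getLibraryInstallName() { return InstallName; }
    bool needToSaveValidRelocs() { return InstallName.has_value(); }

    // Rebase relocations found in [StartOffset, EndOffset) of the input into
    // the linked file by adding LinkedOffset, keyed on the input CU offset.
    void updateAndSaveValidRelocs(bool /*IsDWARF5*/,
                                  uint64_t OriginalUnitOffset,
                                  int64_t LinkedOffset, uint64_t StartOffset,
                                  uint64_t /*EndOffset*/) {
      Saved.push_back({StartOffset + static_cast<uint64_t>(LinkedOffset),
                       OriginalUnitOffset});
    }

    // Once the unit's final output offset is known, re-key saved entries.
    void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset,
                                         uint64_t OutputUnitOffset) {
      for (SavedReloc &R : Saved)
        if (R.UnitKey == OriginalUnitOffset)
          R.UnitKey = OutputUnitOffset;
    }
  };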
--- llvm/docs/CommandGuide/dsymutil.rst | 15 ++ llvm/include/llvm/BinaryFormat/Dwarf.def | 1 + llvm/include/llvm/BinaryFormat/MachO.h | 1 + llvm/include/llvm/DWARFLinker/DWARFLinker.h | 23 ++ .../llvm/DWARFLinkerParallel/AddressesMap.h | 18 ++ llvm/include/llvm/TargetParser/Triple.h | 6 +- llvm/lib/DWARFLinker/DWARFLinker.cpp | 61 ++++- llvm/lib/TargetParser/Triple.cpp | 58 ++--- .../Contents/Info.plist | 20 ++ .../Resources/DWARF/bar-relink-variant.dylib | Bin 0 -> 8923 bytes .../aarch64/bar-relink-variant.dylib.yml | 8 + .../bar-relink.dylib.dSYM/Contents/Info.plist | 20 ++ .../Contents/Resources/DWARF/bar-relink.dylib | Bin 0 -> 8923 bytes .../Relocations/aarch64/bar-relink.dylib.yml | 8 + .../Inputs/basic-relink.macho.arm64.dylib | Bin 0 -> 50992 bytes .../Inputs/basic-relink.macho.arm64.o | Bin 0 -> 2176 bytes .../Contents/Info.plist | 20 ++ .../Resources/DWARF/foo-relink-variant.dylib | Bin 0 -> 9196 bytes .../DWARF/foo-relink-variant_debug.dylib | Bin 0 -> 9196 bytes .../aarch64/foo-relink-variant.dylib.yml | 9 + .../foo-relink.dylib.dSYM/Contents/Info.plist | 20 ++ .../Contents/Resources/DWARF/foo-relink.dylib | Bin 0 -> 9292 bytes .../Relocations/aarch64/foo-relink.dylib.yml | 10 + .../Contents/Info.plist | 20 ++ .../Resources/DWARF/proxy-relink.dylib | Bin 0 -> 10046 bytes .../aarch64/proxy-relink.dylib.yml | 14 ++ .../Inputs/two-level-relink.macho.arm64.dylib | Bin 0 -> 50944 bytes .../Inputs/variant-relink.macho.arm64.dylib | Bin 0 -> 50944 bytes llvm/test/tools/dsymutil/basic-linking.test | 122 ++++++++++ llvm/test/tools/dsymutil/cmdline.test | 2 + llvm/tools/dsymutil/CMakeLists.txt | 1 + llvm/tools/dsymutil/DebugMap.cpp | 23 +- llvm/tools/dsymutil/DebugMap.h | 44 ++-- llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 202 ++++++++++++---- llvm/tools/dsymutil/DwarfLinkerForBinary.h | 112 +++++++-- llvm/tools/dsymutil/LinkUtils.h | 6 + llvm/tools/dsymutil/MachODebugMapParser.cpp | 226 +++++++++++++++--- llvm/tools/dsymutil/Options.td | 11 + llvm/tools/dsymutil/RelocationMap.cpp | 92 +++++++ llvm/tools/dsymutil/RelocationMap.h | 160 +++++++++++++ llvm/tools/dsymutil/dsymutil.cpp | 19 +- llvm/tools/dsymutil/dsymutil.h | 6 +- llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp | 12 + llvm/tools/llvm-nm/llvm-nm.cpp | 47 ++-- 44 files changed, 1218 insertions(+), 199 deletions(-) create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Info.plist create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/DWARF/bar-relink-variant.dylib create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/bar-relink-variant.dylib.yml create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Info.plist create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Resources/DWARF/bar-relink.dylib create mode 100644 llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/bar-relink.dylib.yml create mode 100644 llvm/test/tools/dsymutil/Inputs/basic-relink.macho.arm64.dylib create mode 100644 llvm/test/tools/dsymutil/Inputs/basic-relink.macho.arm64.o create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Info.plist create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant.dylib create mode 100644 
llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant_debug.dylib
 create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/foo-relink-variant.dylib.yml
 create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Info.plist
 create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Resources/DWARF/foo-relink.dylib
 create mode 100644 llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/foo-relink.dylib.yml
 create mode 100644 llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Info.plist
 create mode 100644 llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/DWARF/proxy-relink.dylib
 create mode 100644 llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/proxy-relink.dylib.yml
 create mode 100644 llvm/test/tools/dsymutil/Inputs/two-level-relink.macho.arm64.dylib
 create mode 100644 llvm/test/tools/dsymutil/Inputs/variant-relink.macho.arm64.dylib
 create mode 100644 llvm/tools/dsymutil/RelocationMap.cpp
 create mode 100644 llvm/tools/dsymutil/RelocationMap.h

diff --git a/llvm/docs/CommandGuide/dsymutil.rst b/llvm/docs/CommandGuide/dsymutil.rst
index 02243e227a24d..df621a429bb5c 100644
--- a/llvm/docs/CommandGuide/dsymutil.rst
+++ b/llvm/docs/CommandGuide/dsymutil.rst
@@ -32,11 +32,26 @@ OPTIONS
    architectures will be linked by default and any architectures that can't be
    properly linked will cause :program:`dsymutil` to return an error.
 
+.. option:: --build-variant-suffix
+
+   Specify the build variant suffix used to build the executable file.
+   There can be multiple variants for the binary of a product, each built
+   slightly differently. The most common build variants are 'debug' and
+   'profile'. Setting the DYLD_IMAGE_SUFFIX environment variable will
+   cause dyld to load the specified variant at runtime.
+
 .. option:: --dump-debug-map
 
    Dump the *executable*'s debug-map (the list of the object files containing
    the debug information) in YAML format and exit. No DWARF link will take
    place.
 
+.. option:: -D
+
+   Specify a directory that contains dSYM files to search for.
+   This is used for mergeable libraries, so dsymutil knows where to look
+   for dSYM files with debug information about symbols present in those
+   libraries.
+
 .. option:: --fat64
 
    Use a 64-bit header when emitting universal binaries.
diff --git a/llvm/include/llvm/BinaryFormat/Dwarf.def b/llvm/include/llvm/BinaryFormat/Dwarf.def
index fb328a0257732..d1abb1f361d3e 100644
--- a/llvm/include/llvm/BinaryFormat/Dwarf.def
+++ b/llvm/include/llvm/BinaryFormat/Dwarf.def
@@ -629,6 +629,7 @@ HANDLE_DW_AT(0x3fec, APPLE_objc_complete_type, 0, APPLE)
 HANDLE_DW_AT(0x3fed, APPLE_property, 0, APPLE)
 HANDLE_DW_AT(0x3fee, APPLE_objc_direct, 0, APPLE)
 HANDLE_DW_AT(0x3fef, APPLE_sdk, 0, APPLE)
+HANDLE_DW_AT(0x3ff0, APPLE_origin, 0, APPLE)
 
 // Attribute form encodings.
HANDLE_DW_FORM(0x01, addr, 2, DWARF)
diff --git a/llvm/include/llvm/BinaryFormat/MachO.h b/llvm/include/llvm/BinaryFormat/MachO.h
index 49991ebe7bfaf..bef70f869520b 100644
--- a/llvm/include/llvm/BinaryFormat/MachO.h
+++ b/llvm/include/llvm/BinaryFormat/MachO.h
@@ -373,6 +373,7 @@ enum StabType {
   N_SSYM = 0x60u,
   N_SO = 0x64u,
   N_OSO = 0x66u,
+  N_LIB = 0x68u,
   N_LSYM = 0x80u,
   N_BINCL = 0x82u,
   N_SOL = 0x84u,
diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/DWARFLinker.h
index 6887e441ce8ff..2bd85e30d3b13 100644
--- a/llvm/include/llvm/DWARFLinker/DWARFLinker.h
+++ b/llvm/include/llvm/DWARFLinker/DWARFLinker.h
@@ -62,6 +62,9 @@ class AddressesMap {
   virtual std::optional<int64_t>
   getSubprogramRelocAdjustment(const DWARFDie &DIE) = 0;
 
+  /// Returns the library install name associated with the AddressesMap.
+  virtual std::optional<StringRef> getLibraryInstallName() = 0;
+
   /// Apply the valid relocations to the buffer \p Data, taking into
   /// account that Data is at \p BaseOffset in the .debug_info section.
   ///
@@ -69,6 +72,23 @@ class AddressesMap {
   virtual bool applyValidRelocs(MutableArrayRef<char> Data, uint64_t BaseOffset,
                                 bool IsLittleEndian) = 0;
 
+  /// Check if the linker needs to gather and save relocation info.
+  virtual bool needToSaveValidRelocs() = 0;
+
+  /// Update and save original relocations located in between StartOffset and
+  /// EndOffset. LinkedOffset is the value which should be added to the
+  /// original relocation offset to get the new relocation offset in the
+  /// linked binary.
+  virtual void updateAndSaveValidRelocs(bool IsDWARF5,
+                                        uint64_t OriginalUnitOffset,
+                                        int64_t LinkedOffset,
+                                        uint64_t StartOffset,
+                                        uint64_t EndOffset) = 0;
+
+  /// Update the valid relocations that used OriginalUnitOffset as the compile
+  /// unit offset, and update their values to reflect OutputUnitOffset.
+  virtual void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset,
+                                               uint64_t OutputUnitOffset) = 0;
+
   /// Erases all data.
   virtual void clear() = 0;
 };
@@ -751,6 +771,9 @@ class DWARFLinker {
     /// Is there a DW_AT_str_offsets_base in the CU?
     bool AttrStrOffsetBaseSeen = false;
 
+    /// Is there a DW_AT_APPLE_origin in the CU?
+    bool HasAppleOrigin = false;
+
     AttributesInfo() = default;
   };
diff --git a/llvm/include/llvm/DWARFLinkerParallel/AddressesMap.h b/llvm/include/llvm/DWARFLinkerParallel/AddressesMap.h
index 22fbec20d7d37..b451fee4e0b72 100644
--- a/llvm/include/llvm/DWARFLinkerParallel/AddressesMap.h
+++ b/llvm/include/llvm/DWARFLinkerParallel/AddressesMap.h
@@ -55,6 +55,9 @@ class AddressesMap {
   virtual std::optional<int64_t>
   getSubprogramRelocAdjustment(const DWARFDie &DIE) = 0;
 
+  // Returns the library install name associated with the AddressesMap.
+  virtual std::optional<StringRef> getLibraryInstallName() = 0;
+
   /// Apply the valid relocations to the buffer \p Data, taking into
   /// account that Data is at \p BaseOffset in the .debug_info section.
   ///
@@ -62,6 +65,21 @@ class AddressesMap {
   virtual bool applyValidRelocs(MutableArrayRef<char> Data, uint64_t BaseOffset,
                                 bool IsLittleEndian) = 0;
 
+  /// Check if the linker needs to gather and save relocation info.
+  virtual bool needToSaveValidRelocs() = 0;
+
+  /// Update and save relocation values to be serialized.
+  virtual void updateAndSaveValidRelocs(bool IsDWARF5,
+                                        uint64_t OriginalUnitOffset,
+                                        int64_t LinkedOffset,
+                                        uint64_t StartOffset,
+                                        uint64_t EndOffset) = 0;
+
+  /// Update the valid relocations that used OriginalUnitOffset as the compile
+  /// unit offset, and update their values to reflect OutputUnitOffset.
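+  /// (Illustrative example: relocations saved while cloning a unit located
+  /// at input offset 0x40 are keyed on 0x40; once that unit is emitted at
+  /// output offset 0x0, this call rebases those keys to 0x0.)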
+  virtual void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset,
+                                               uint64_t OutputUnitOffset) = 0;
+
   /// Erases all data.
   virtual void clear() = 0;
diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h
index 53cef0abbe0e1..0f56ac68c851f 100644
--- a/llvm/include/llvm/TargetParser/Triple.h
+++ b/llvm/include/llvm/TargetParser/Triple.h
@@ -418,9 +418,6 @@ class Triple {
   /// Get the architecture (first) component of the triple.
   StringRef getArchName() const;
 
-  /// Get the architecture name based on Kind and SubArch.
-  StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch) const;
-
   /// Get the vendor (second) component of the triple.
   StringRef getVendorName() const;
 
@@ -1118,6 +1115,9 @@ class Triple {
   /// Get the canonical name for the \p Kind architecture.
   static StringRef getArchTypeName(ArchType Kind);
 
+  /// Get the architecture name based on \p Kind and \p SubArch.
+  static StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch);
+
   /// Get the "prefix" canonical name for the \p Kind architecture. This is the
   /// prefix used by the architecture specific builtins, and is suitable for
   /// passing to \see Intrinsic::getIntrinsicForClangBuiltin().
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 2d8360f100c11..80a4e2adefa6c 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -1026,6 +1026,15 @@ unsigned DWARFLinker::DIECloner::cloneStringAttribute(DIE &Die,
     StringEntry = DebugLineStrPool.getEntry(*String);
   } else {
     StringEntry = DebugStrPool.getEntry(*String);
+
+    if (AttrSpec.Attr == dwarf::DW_AT_APPLE_origin) {
+      Info.HasAppleOrigin = true;
+      if (std::optional<StringRef> FileName =
+              ObjFile.Addresses->getLibraryInstallName()) {
+        StringEntry = DebugStrPool.getEntry(*FileName);
+      }
+    }
+
     // Update attributes info.
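+    // (Info.Name / Info.MangledName recorded here are later used to emit
+    // the accelerator-table entries for this DIE.)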
     if (AttrSpec.Attr == dwarf::DW_AT_name)
       Info.Name = StringEntry;
@@ -1637,6 +1646,12 @@ shouldSkipAttribute(bool Update,
   }
 }
 
+struct AttributeLinkedOffsetFixup {
+  int64_t LinkedOffsetFixupVal;
+  uint64_t InputAttrStartOffset;
+  uint64_t InputAttrEndOffset;
+};
+
 DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
                                       const DWARFFile &File, CompileUnit &Unit,
                                       int64_t PCOffset, uint32_t OutOffset,
@@ -1720,6 +1735,9 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
     Flags |= TF_SkipPC;
   }
 
+  std::optional<StringRef> LibraryInstallName =
+      ObjFile.Addresses->getLibraryInstallName();
+  SmallVector<AttributeLinkedOffsetFixup> AttributesFixups;
   for (const auto &AttrSpec : Abbrev->attributes()) {
     if (shouldSkipAttribute(Update, AttrSpec, Flags & TF_SkipPC)) {
       DWARFFormValue::skipValue(AttrSpec.Form, Data, &Offset,
@@ -1727,17 +1745,41 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
       continue;
     }
 
+    AttributeLinkedOffsetFixup CurAttrFixup;
+    CurAttrFixup.InputAttrStartOffset = InputDIE.getOffset() + Offset;
+    CurAttrFixup.LinkedOffsetFixupVal =
+        Unit.getStartOffset() + OutOffset - CurAttrFixup.InputAttrStartOffset;
+
     DWARFFormValue Val = AttrSpec.getFormValue();
     uint64_t AttrSize = Offset;
     Val.extractValue(Data, &Offset, U.getFormParams(), &U);
+    CurAttrFixup.InputAttrEndOffset = InputDIE.getOffset() + Offset;
     AttrSize = Offset - AttrSize;
 
-    OutOffset += cloneAttribute(*Die, InputDIE, File, Unit, Val, AttrSpec,
-                                AttrSize, AttrInfo, IsLittleEndian);
+    uint64_t FinalAttrSize =
+        cloneAttribute(*Die, InputDIE, File, Unit, Val, AttrSpec, AttrSize,
+                       AttrInfo, IsLittleEndian);
+    if (FinalAttrSize != 0 && ObjFile.Addresses->needToSaveValidRelocs())
+      AttributesFixups.push_back(CurAttrFixup);
+
+    OutOffset += FinalAttrSize;
   }
 
-  // Look for accelerator entries.
   uint16_t Tag = InputDIE.getTag();
+  // Add the DW_AT_APPLE_origin attribute to the compile unit DIE if we have
+  // an install name and the DWARF doesn't have the attribute yet.
+  const bool NeedsAppleOrigin = (Tag == dwarf::DW_TAG_compile_unit) &&
+                                LibraryInstallName.has_value() &&
+                                !AttrInfo.HasAppleOrigin;
+  if (NeedsAppleOrigin) {
+    auto StringEntry = DebugStrPool.getEntry(LibraryInstallName.value());
+    Die->addValue(DIEAlloc, dwarf::Attribute(dwarf::DW_AT_APPLE_origin),
+                  dwarf::DW_FORM_strp, DIEInteger(StringEntry.getOffset()));
+    AttrInfo.Name = StringEntry;
+    OutOffset += 4;
+  }
+
+  // Look for accelerator entries.
   // FIXME: This is slightly wrong. An inline_subroutine without a
   // low_pc, but with AT_ranges might be interesting to get into the
   // accelerator tables too. For now stick with dsymutil's behavior.
@@ -1806,8 +1848,19 @@ DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
   Linker.assignAbbrev(NewAbbrev);
   Die->setAbbrevNumber(NewAbbrev.getNumber());
 
+  uint64_t AbbrevNumberSize = getULEB128Size(Die->getAbbrevNumber());
+
   // Add the size of the abbreviation number to the output offset.
-  OutOffset += getULEB128Size(Die->getAbbrevNumber());
+  OutOffset += AbbrevNumberSize;
+
+  // Update the fixups with the size of the abbreviation number.
+  for (AttributeLinkedOffsetFixup &F : AttributesFixups)
+    F.LinkedOffsetFixupVal += AbbrevNumberSize;
+
+  for (AttributeLinkedOffsetFixup &F : AttributesFixups)
+    ObjFile.Addresses->updateAndSaveValidRelocs(
+        Unit.getOrigUnit().getVersion() >= 5, Unit.getOrigUnit().getOffset(),
+        F.LinkedOffsetFixupVal, F.InputAttrStartOffset, F.InputAttrEndOffset);
 
   if (!HasChildren) {
     // Update our size.
diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp
index b9fab469f7476..5d4eb79675f89 100644
--- a/llvm/lib/TargetParser/Triple.cpp
+++ b/llvm/lib/TargetParser/Triple.cpp
@@ -90,6 +90,36 @@ StringRef Triple::getArchTypeName(ArchType Kind) {
   llvm_unreachable("Invalid ArchType!");
 }
 
+StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) {
+  switch (Kind) {
+  case Triple::mips:
+    if (SubArch == MipsSubArch_r6)
+      return "mipsisa32r6";
+    break;
+  case Triple::mipsel:
+    if (SubArch == MipsSubArch_r6)
+      return "mipsisa32r6el";
+    break;
+  case Triple::mips64:
+    if (SubArch == MipsSubArch_r6)
+      return "mipsisa64r6";
+    break;
+  case Triple::mips64el:
+    if (SubArch == MipsSubArch_r6)
+      return "mipsisa64r6el";
+    break;
+  case Triple::aarch64:
+    if (SubArch == AArch64SubArch_arm64ec)
+      return "arm64ec";
+    if (SubArch == AArch64SubArch_arm64e)
+      return "arm64e";
+    break;
+  default:
+    break;
+  }
+  return getArchTypeName(Kind);
+}
+
 StringRef Triple::getArchTypePrefix(ArchType Kind) {
   switch (Kind) {
   default:
@@ -1143,34 +1173,6 @@ StringRef Triple::getArchName() const {
   return StringRef(Data).split('-').first;           // Isolate first component
 }
 
-StringRef Triple::getArchName(ArchType Kind, SubArchType SubArch) const {
-  switch (Kind) {
-  case Triple::mips:
-    if (SubArch == MipsSubArch_r6)
-      return "mipsisa32r6";
-    break;
-  case Triple::mipsel:
-    if (SubArch == MipsSubArch_r6)
-      return "mipsisa32r6el";
-    break;
-  case Triple::mips64:
-    if (SubArch == MipsSubArch_r6)
-      return "mipsisa64r6";
-    break;
-  case Triple::mips64el:
-    if (SubArch == MipsSubArch_r6)
-      return "mipsisa64r6el";
-    break;
-  case Triple::aarch64:
-    if (SubArch == AArch64SubArch_arm64ec)
-      return "arm64ec";
-    break;
-  default:
-    break;
-  }
-  return getArchTypeName(Kind);
-}
-
 StringRef Triple::getVendorName() const {
   StringRef Tmp = StringRef(Data).split('-').second; // Strip first component
   return Tmp.split('-').first;                       // Isolate second component
diff --git a/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Info.plist b/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Info.plist
new file mode 100644
index 0000000000000..14d6272d3f8d9
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Info.plist
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+	<dict>
+		<key>CFBundleDevelopmentRegion</key>
+		<string>English</string>
+		<key>CFBundleIdentifier</key>
+		<string>com.apple.xcode.dsym.bar-relink-variant.dylib</string>
+		<key>CFBundleInfoDictionaryVersion</key>
+		<string>6.0</string>
+		<key>CFBundlePackageType</key>
+		<string>dSYM</string>
+		<key>CFBundleSignature</key>
+		<string>????</string>
+		<key>CFBundleShortVersionString</key>
+		<string>1.0</string>
+		<key>CFBundleVersion</key>
+		<string>1</string>
+	</dict>
+</plist>
diff --git a/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/DWARF/bar-relink-variant.dylib b/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/DWARF/bar-relink-variant.dylib
new file mode 100644
index 0000000000000000000000000000000000000000..ec2e06a231f88946e6a74540c847fa801fcbbb45
GIT binary patch
literal 8923
zcmeHN%}*0S6rb(32o#qhF+#Xhf(azDZ3#^bMrqTcG$GE>J1E&F+FoOlUk9
zZ+@ISa`52IBZ(eN^bhb4;NVdY81EWC-rJceOKW4jX)=$z&YL&y{r1gomc#Db_g_E%
z=pjTRAl;xRKu1G3qcQ};0O*6E#nSF}GFnr%B_5f??%Tw~j%Hk}{G+nc6qokt<{T+bCGHQy58ZJKw=
zW=SiXD;6v?*7t(p<$X!K7N4tb;y#~&YFakEoUXeGNimy`z9lqerj$?2FX^s+6x0eY
zcocGcpFTgDSl2A5rd`X;&&_0W{0wT~M1gbigpKFf6zkzNp$l;apjfEqW_s}ox~M#6
zBgcu-`2iyf?;f*l#k>M7FF*mQzy<3ITZN^(;cnd6ktxY_u
zaJz^pjrHv_Jig!kKj5*;ia{_S7!V8y1_T3w0l|P^Krrx6GZ294df@?&KR4rZS}#7R
zr9eH=3c5|;XNNn)C>Rh72nGZLf&syRU_dY+7!V8y1_T3w0l~n3#sIdrg9N)=k|cdo
zPT~Ns1F+{S5qR<+w+(2~uB%>?4BEE={-WS?Nba5s1R)+<#rPV55ZZL-5&jmux6FP@
zFa|xl)4Nm;!KUmb&^^yf$V;?+jr_nqIIe<9on112Ga}GaHPHt@)Lx|;DMszw;L!t25pRcb?agH
zE$9W9<97uf?=bTIHJ%veOn{!j287@*3>F*^NW4a|@sKeGPZaLb
R=c7|zMSeI~X(|&-`~`Fg`G)`i
literal 0
HcmV?d00001
diff --git a/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/bar-relink-variant.dylib.yml b/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/bar-relink-variant.dylib.yml
new file mode 100644
index 0000000000000..68fd250cdf2a8
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/bar-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/bar-relink-variant.dylib.yml
@@ -0,0 +1,8 @@
+---
+triple: 'arm64-apple-darwin'
+binary-path: bar-relink-variant.dylib
+relocations:
+  - { offset: 0x26, size: 0x8, addend: 0x0, symName: _bar, symObjAddr: 0x0, symBinAddr: 0x3FA0, symSize: 0x8 }
+  - { offset: 0x3F, size: 0x8, addend: 0x0, symName: _baz, symObjAddr: 0x8, symBinAddr: 0x4000, symSize: 0x0 }
+  - { offset: 0x4F, size: 0x8, addend: 0x0, symName: _bar, symObjAddr: 0x0, symBinAddr: 0x3FA0, symSize: 0x8 }
+...
diff --git a/llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Info.plist b/llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Info.plist
new file mode 100644
index 0000000000000..37c2a8aebe4c6
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Info.plist
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+	<dict>
+		<key>CFBundleDevelopmentRegion</key>
+		<string>English</string>
+		<key>CFBundleIdentifier</key>
+		<string>com.apple.xcode.dsym.bar-relink.dylib</string>
+		<key>CFBundleInfoDictionaryVersion</key>
+		<string>6.0</string>
+		<key>CFBundlePackageType</key>
+		<string>dSYM</string>
+		<key>CFBundleSignature</key>
+		<string>????</string>
+		<key>CFBundleShortVersionString</key>
+		<string>1.0</string>
+		<key>CFBundleVersion</key>
+		<string>1</string>
+	</dict>
+</plist>
diff --git a/llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Resources/DWARF/bar-relink.dylib b/llvm/test/tools/dsymutil/Inputs/bar-relink.dylib.dSYM/Contents/Resources/DWARF/bar-relink.dylib
new file mode 100644
index 0000000000000000000000000000000000000000..9ebd56ecb00a482b3694f1ef2f27dea726effc43
GIT binary patch
literal 8923
zcmeHN&rcIU6rSz22o#qhF+#Xh0|`xJ+Y(3&{-8~Z(tu!u#NO6zyFjJvHoFTbF@YFk
zJgSKa2M-)Pc=Jf2F(!I7@eknWK@S@5nyByX%#@|I5pSBzBd_!3&3oUz`DQunzJ34g
z%a0C1Bm&Y7dIWSZgfl9AF!X?~KmHI4zB^aA`(!&baPQUhF!OtfM1dJ9ApuZ~ClP8#
zVFY~A3;GFE)8;4g^I+6Vs@_58SPClQqG+0H+;pi;=Lhk2#_3w?en>6Xa|OwkZwc^L
z&GoWb(#qz71q=1{J!5!zUlOmy=c<{w&uvgm%ckekH8&wCX7kavgr>}t@`?E+-PQMl
zn&AcaLk{lK=SLIkngrFf>DC_3isGX-MBxau)3&K%BF#?W_TGEPomuT{mAijTQ?UCrvWd+>PxWn
z%pvjEZP9?|FuWy(=NuAGFBEKJt#QA53~z?v?Hv-&acwW7p1;^b3iOuY#ZEG#$)IU^
zwOTPWQ(rNhs$Q(e`^@nAS^nq0!GoN7yvQR8#Q(1%rBLcyRvP_kZQl#H=N
z>XHVO?vSk9xvhNNR3az(lrHI53luv-WF^-<-a8gP8xFwN3UWUbj`4xvSmh-qiRA;L-WQNK+nJ&zbo)~hmrTM@x(A^7!-x$^6LZZp5TojkrwziAOwG5u;73|5?d4-4;gdt
UMBy%dIymN4M`Egypnx$LNK6zGz;o`MX{YNpH6h`P|ABL7
z{xfIJocYbY`)$&1ZePD!LR3Ipir6aH+C4;D?H17}*4ud0vKs0e8(8F)Ku(9!h1@O1
zq3mK=>87=5CAi9sS24#KYgaC!UH$G7C~Hc~N~JTQR9{zaysV1Hd$WsZGhVTZr^hWT
zoJuF7@iodnH{Kg6o?DG;t>l2%(q~zjct4m)j8VCOFS$q5=;k`udOAoey!u__eu0nycXDT7B5_~sJ?EYes_CR%z&zuu6@YS
zvNv;orL3P~8=2D|3Z|mr>B**8G~ViK3x-<~zF@L#mfsiYj7394ekLHisfTDaS2LAf
z&06x3+$Cl=>sqc4pWuF8co?^ru}OJSnS4hzY-V082b6A}pN~fsd&xSdHVr>LZMlET
zcK5N`^PVqXnZbQtHYa8ox616coC_8G<(9SW{6%w-??iKO){&V7RFg?1Yk0?E%R5u)
zrZ(R^%|PZ&VUsxfdWhtmdO~U0x0Eq#_ByASL%t2zUU+~22!H?xfB*=900@8p2!H?x
fB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=9
z00@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p
z2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?x
zfB*=900@8p2!H?xfB*=900{hF3EWwK{DV7g>f-(HpxWoU`0ngGch-Ngug7g&xHq59-Ycgg
z*UAG&iVJD@HEv(xHurbQc%UHrqV}6g*%2~6s4qFbeD}4x!WFnhu5lNZE^_YUj^@SX
zD(ob62vV!7u`a$W>yo?ia)($<-*cC?S>zsX3E#l?IyZb>9JAmV?yKXv#c}i){#m1q
zCJ2B42!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p
z2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfWQMLVE?`E2fQYh
0|5{K0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH
0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI5C8!X009sH0T2KI
5coF%FLy0tYh)`>
#+6+>svfCoUD-NNL=@0HYOYIZt38uWal62xY0~AUa~t{b_eSziHp^bFB$chBN8PkT
`uA@n@&se)=0t)X4h3mcB${fE1v_cH^h{x=(ca=oL{mA`SWa^B90}^I8Ihz_@pB!~
g6n5*l8@FpA7Ly-wD2{@&3vkO%*>~dwV6*@j;>~%FQ2t4pGSB-$wzB$
+UK3iM~|2DxR*^A@i+66{grd4&R^n-Hs_b~XG2*wF7m#~xPJfSyhZEtoWPnT|946a
S7KhW)YoE#f=S}XIG%{EE?)_GrTydeCaqF`46E6IvSZUg8u1-7?_DM
kWe`3_D>}}{97bHQJQfj_wg~h9w(QFEB;5+!6rW)Qc8(Vk=I2fjvVsYt~~~2>ZJM6
XXCHz-LUvTm1p9?Gtra3_*U1)CVzV4*zLzEzdrQix4j>3+JCSyar(FB9?!+IXIouM
8w>AkIJ9Bi)*n89eZltXy+e(A-#)$SN=MnQ%a_LVw!XTetKNSr^U{IiFTWSRF}>=r
H~jPY-(S1=ZnOLS>Y`bHhQ~GEEFE_E+#h?sTeFIJn`UzB=E#P00|^K@<5^{Jm`bbC-6ZCPsSIc(Kk)i@7y_eTP8*8NzVPwJ@=e%
&YgSi%(s91{MQyC3<04b4IQB&4~4u4y=N+O-J!~BHOhhLc^_=j962FUM&7w7w&Qtq
yjESEYilnX#Fv777bNNtX>m!%5K8HWQMc(Yuv5xs`81C%J>vQ_2XU_zDH~pWaX~uu
de>Vb{;~JUbH{oi``P-K@oa(*?q?`9U(=rtC?(0a%se6EBj)=`@Ao-;*tE(>^8Lm<
KQf-ku(1k>`kYRu-+9M`P`t5x+1ygpKbaLDW}IcRl!r{%d>NY&-E#g
UoA((0lHsWP2VF=a=-BE(
w(ow)+F|TOL8)OeIq-JwP%eckO~aTRKrJuB_ZBM53<}daoXf$?l9|gLhS_He${pQf
T9kN%K3R0`O-nVXczOZEALS0U=cHWxM-K&5OpVP
xQwY#lJ%d`3!W_-w<+Sb_sFg_%}O8U{63FLn(&(
l0>~U;D>6480J-{1>btS|0hP*_X6|Ne}L-4S)ws=y*l0e^@|^bpZ+k4^$L^@nrfiK
AA~~a(8)!uz%E!U+%0K6*GP9Mt#)e|#-G+!0U#2n@o#6Z@;CK94ZAYw&A(@isZ>7*
Xl*+bcXEi{4;<6DzJ%|56l3-f5ZT$+cn+ZT>=DLSfR$g6{f#@U5Niruk+e
l^!JQXx2?&?67jE!)SV(#N42C)zpPf&?i^5KNsUFF@DKB93~c}a
literal 0
HcmV?d00001
diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Info.plist b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Info.plist
new file mode 100644
index 0000000000000..7e84e95bd3f48
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Info.plist
@@ -0,0 +1,20 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+	<dict>
+		<key>CFBundleDevelopmentRegion</key>
+		<string>English</string>
+		<key>CFBundleIdentifier</key>
+		<string>com.apple.xcode.dsym.foo-relink-variant.dylib</string>
+		<key>CFBundleInfoDictionaryVersion</key>
+		<string>6.0</string>
+		<key>CFBundlePackageType</key>
+		<string>dSYM</string>
+		<key>CFBundleSignature</key>
+		<string>????</string>
+		<key>CFBundleShortVersionString</key>
+		<string>1.0</string>
+		<key>CFBundleVersion</key>
+		<string>1</string>
+	</dict>
+</plist>
diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant.dylib b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant.dylib
new file mode 100644
index 0000000000000000000000000000000000000000..523f85712de567adb16857bb2603c07d64bc4638
GIT binary patch
literal 9196
zcmeHNJ8Tn47@qZdlMp)#1OY)%EH44#upJa6;B=612E`;igG5Aeduw|WnV4P6y9u0h
z5(p@u195Gn6clubiUOKQAR!tmI$CsBfRIvx;O_f(W*mE+=yKN~%}BHVWB&h}na@*r
z9&bK>{FY`+V<_FI9jNz`Wbv{Yq7U`hlMm+}7`gi0-lua@htBNXFZ@v=GhtC>EP+b#
zYz@~f$ki}6q5g$x+ZT?HUw{%XgK-D94G5})OU1TB_gctxZnxrkG2
zDoaa==hPi<(hb_iH!H^Xt4N=L$0LMC0RCfTl~!T2#k8=gFPAiP^fss--{Ao0ug
zE@+E~5>{lje|UTm4Ln|A%ge=h#xHOrBWG7M=`_p1`U!O|@-p@qQqI_d-k3fan*Wr(
zI^BipiTmkiWEs)TXmTF9k!0;XyhMZS;S_Z0p?RE%W<&GeqIt20cVxBppsm$*n(GYR
zy!pHN>AJaQ<7RWM){AExi42;0JN~@BX|PW>52t=i71kZavjTm9B^nOQe+SdeBdJsh
z&}@kArma#Hcl*|iTMsI}XZ0V*X)1K|G$6=DDY
literal 0
HcmV?d00001
diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant_debug.dylib b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/DWARF/foo-relink-variant_debug.dylib
new file mode 100644
index 0000000000000000000000000000000000000000..523f85712de567adb16857bb2603c07d64bc4638
GIT binary patch
literal 9196
zcmeHNJ8Tn47@qZdlMp)#1OY)%EH44#upJa6;B=612E`;igG5Aeduw|WnV4P6y9u0h
z5(p@u195Gn6clubiUOKQAR!tmI$CsBfRIvx;O_f(W*mE+=yKN~%}BHVWB&h}na@*r
z9&bK>{FY`+V<_FI9jNz`Wbv{Yq7U`hlMm+}7`gi0-lua@htBNXFZ@v=GhtC>EP+b#
zYz@~f$ki}6q5g$x+ZT?HUw{%XgK-D94G5})OU1TB_gctxZnxrkG2
zDoaa==hPi<(hb_iH!H^Xt4N=L$0LMC0RCfTl~!T2#k8=gFPAiP^fss--{Ao0ug
zE@+E~5>{lje|UTm4Ln|A%ge=h#xHOrBWG7M=`_p1`U!O|@-p@qQqI_d-k3fan*Wr(
zI^BipiTmkiWEs)TXmTF9k!0;XyhMZS;S_Z0p?RE%W<&GeqIt20cVxBppsm$*n(GYR
zy!pHN>AJaQ<7RWM){AExi42;0JN~@BX|PW>52t=i71kZavjTm9B^nOQe+SdeBdJsh
z&}@kArma#Hcl*|iTMsI}XZ0V*X)1K|G$6=DDY
literal 0
HcmV?d00001
diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/foo-relink-variant.dylib.yml b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/foo-relink-variant.dylib.yml
new file mode 100644
index 0000000000000..0bc06e9a9e857
--- /dev/null
+++ b/llvm/test/tools/dsymutil/Inputs/foo-relink-variant.dylib.dSYM/Contents/Resources/Relocations/aarch64/foo-relink-variant.dylib.yml
@@ -0,0 +1,9 @@
+---
+triple: 'arm64-apple-darwin'
+binary-path: foo-relink-variant.dylib
+relocations:
+  - { offset: 0x26, size: 0x8, addend: 0x0, symName: _foo, symObjAddr: 0x0, symBinAddr: 0x3F64, symSize: 0x20 }
+  - { offset: 0x33, size: 0x8, addend: 0x0, symName: _foo, symObjAddr: 0x0, symBinAddr: 0x3F64, symSize: 0x20 }
+  - { offset: 0x88, size: 0x8, addend: 0x0, symName: _altfoo, symObjAddr: 0x0, symBinAddr: 0x3F84, symSize: 0x24 }
+  - { offset: 0x95, size: 0x8, addend: 0x0, symName: _altfoo, symObjAddr: 0x0, symBinAddr: 0x3F84, symSize: 0x24 }
symBinAddr: 0x3F84, symSize: 0x24 } +... diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Info.plist b/llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Info.plist new file mode 100644 index 0000000000000..e919260131558 --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Info.plist @@ -0,0 +1,20 @@ + + + + + CFBundleDevelopmentRegion + English + CFBundleIdentifier + com.apple.xcode.dsym.foo-relink.dylib + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + dSYM + CFBundleSignature + ???? + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + + diff --git a/llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Resources/DWARF/foo-relink.dylib b/llvm/test/tools/dsymutil/Inputs/foo-relink.dylib.dSYM/Contents/Resources/DWARF/foo-relink.dylib new file mode 100644 index 0000000000000000000000000000000000000000..94480df788f119b27746827f6ace437ef091891c GIT binary patch literal 9292 zcmeHN&ugYW11C6E6>veEkPsItRTZemf?B?}GvnB6N6I0`8EN*-d-L8m zGoN=Q&%Al~@8ADr8Pga_2DK0Mhcs!t3_%Q{j*Q=Wyl3^38@K*C{q4^~cb*&&_BfDP z(5Nz&LM4B;jWHdf2uT>bQU64BocW^*^JpZ?MAE=@1A;2yQgNKf`zqpXu6N?iAK`to zYc8$W*pz11+q!_a>|d_=Rj1~k3(%3A-)(_+wk=4`rOni8(mJP69cOmtc=>2)My_>2 zxPXti2w4^?@yNO42ErKxfsRu;J$Z6UctM;HO^K7^`mFfP`k@lGR0B=8#c+!0Ut+m66e5`dpxc?P&aDR!tUT*O+ zJkv4EK)m=YD?%D!>{m#+avHToLk@}-{6ZF_gf+OF-uJ&%-)t{r_ZuI-)})U8A2$L7q=Be-{WZpCYcwZPAfj@w1Mn9Il3k{n*ULx zj}9Bvqz{FVAH{qh-ZbCbAM<~Ium7JCbV`a>QRDXtl5X2jiQgYcnSFnTHpTyfN=1&z z=OJ$5z)o)%%}d8!8iSc=(LEpB&bfZm{0Nd>M*6_$%^YUzr~IMp + + + + CFBundleDevelopmentRegion + English + CFBundleIdentifier + com.apple.xcode.dsym.proxy-relink.dylib + CFBundleInfoDictionaryVersion + 6.0 + CFBundlePackageType + dSYM + CFBundleSignature + ???? + CFBundleShortVersionString + 1.0 + CFBundleVersion + 1 + + diff --git a/llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/DWARF/proxy-relink.dylib b/llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/DWARF/proxy-relink.dylib new file mode 100644 index 0000000000000000000000000000000000000000..c79d8f7825b58cc9ad52b2d9d014ee99089a5e4a GIT binary patch literal 10046 zcmeHNUuYaf7@xhny}RV%*|dZ6 zyJ@Xf8Z49uMGz|#DilFPP!Rg42>pW(3R=ZKeeuZ`MG-|GEDGxH+nKp+Hg~2z`La{K ze1GRR-+Vi_bY}MEA3y!o!kEU;nn7{Uktk`RZ35c>dTOlo!;XRN&IgCC6w{qMetbmQ z-Ht2?jfk-jh~n8Afwcg>4{AGT5y6~K4Np%|fUgbubgAm#L=mZT?wHRwQUUYhbqok! zX(`0bsisEQLQRIix$Vs7?3&4swoae zY;tivUp`qZ7kIIJ)N=LZ^}8tXW<5B543Eow4i{aVq3l$a@7%L{Qap`88Kt;X6{CEK z=y~IjBCi?C^BADl$D{S6ns?5$#KhVWURobZd8r=Y3sN4G+{e6mk&E+!;TTInBCjP~ z;i>gRp4I9#f*8(sjqV;F9vW3^_bT;7qhe5#{A;1PbMwz{1Cm8P`?C{|$tbYTq^NLG zLhp|WPK@T*BvhK8nVX;CrDEA6QzN`SHF!#PY`_$rQ8mglro95M?{m@ds_fr)Zv!A{ zR(Nu`uE4u3@m`Yo-M&LSBbTe1Csxk)+!tctcjfr!?z^-6@FR5t{q^hpQuIG2^Xu(g z4bP}lN+vHGbEaJ}@_xL_Uy1$-=JMwMfQL{&-VKTOj)b{!hj`AZ3ZCvtJWDRvwL-@*K6S3Ir7hDiBm4s6bGGpaMY! 
zf(ir`2r3X%AgDl40dEE9Iy1s(SFdT>pUJz3(bX47W6nuyYsoxemelTh&dASNX`?!~ zttahX|KMUWq_*~LdZhU`T^kR@0YcmR2%}q|I=G&lehoz(AhhY>=J8Mj-A4Zn!gC~0 zgHUG*>-AXkSv@kYH@~lkqhoG9yIe53jH;85I89KKtl_CMjy4xBu?+8FAc=cpal@^O zK<6j*nuy;Amxx!>723)kBK~?UUd>J2tND9~_$_#B0#W7OD1HI}5f>!xKJOOR1#J)L zifd);8n|~>p^spQYT-}8;<1R=rOkZ{qZW2gk|e55tEh$jfWHcL8Zx1J=l(nNXe(mJ zA!zZYFk66t7HWf0kHoyen$-@Y7N%I111%rx9yrxlm*Bky^49Gk_~TxKW(^UZei(sG zp_f}mNQ;rChOaZq7=qL&-y`xG$Xji96t6+E4iauyjbf1Ij#(Q;Yt=R<*XiNp3#XGm zpGmG+zbVfyi@rSz8OfJFspXa zDyO=8(_QJV)E3vewY#?qTDqqz-TmNd?~x3_74wDz7&|kaw+dz&x6GNHR@pJjj-45z zcaT*vtC_te0|$TAIXm-&k>4{poyM7;Yw*NO4vi~!cMo!~3&)v=uBn;Pa%JAJS$>wz z4|#TCB71;OjvgH5QwR1AbI#Oh;<7hY>nat9ntbHoeLe)nTwkZ1>O~ue>QbhG)D31COr@q&ZEGFIqy=X5cE5 uOj10zB*M?=1qKVFZ`Tm9|JHM_Kw|VlBlC6;%s0=Dya}n6D!HiDE9*b6pWSEx literal 0 HcmV?d00001 diff --git a/llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/proxy-relink.dylib.yml b/llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/proxy-relink.dylib.yml new file mode 100644 index 0000000000000..44dd0a2342da8 --- /dev/null +++ b/llvm/test/tools/dsymutil/Inputs/proxy-relink.dylib.dSYM/Contents/Resources/Relocations/aarch64/proxy-relink.dylib.yml @@ -0,0 +1,14 @@ +--- +triple: 'arm64-apple-darwin' +binary-path: proxy-relink.dylib +relocations: + - { offset: 0x26, size: 0x8, addend: 0x0, symName: _display, symObjAddr: 0x0, symBinAddr: 0x3F1C, symSize: 0x1C } + - { offset: 0x41, size: 0x8, addend: 0x0, symName: _display, symObjAddr: 0x0, symBinAddr: 0x3F1C, symSize: 0x1C } + - { offset: 0x7C, size: 0x8, addend: 0x0, symName: _bar, symObjAddr: 0x3FA0, symBinAddr: 0x3F38, symSize: 0x8 } + - { offset: 0x99, size: 0x8, addend: 0x0, symName: _baz, symObjAddr: 0x4000, symBinAddr: 0x8000, symSize: 0x0 } + - { offset: 0xA9, size: 0x8, addend: 0x0, symName: _bar, symObjAddr: 0x3FA0, symBinAddr: 0x3F38, symSize: 0x8 } + - { offset: 0xE8, size: 0x8, addend: 0x0, symName: _foo, symObjAddr: 0x3F60, symBinAddr: 0x3F40, symSize: 0x24 } + - { offset: 0xF9, size: 0x8, addend: 0x0, symName: _foo, symObjAddr: 0x3F60, symBinAddr: 0x3F40, symSize: 0x24 } + - { offset: 0x14E, size: 0x8, addend: 0x0, symName: _altfoo, symObjAddr: 0x3F84, symBinAddr: 0x3F64, symSize: 0x24 } + - { offset: 0x15F, size: 0x8, addend: 0x0, symName: _altfoo, symObjAddr: 0x3F84, symBinAddr: 0x3F64, symSize: 0x24 } +... 
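The Relocations/*.yml files checked in above are the machine-readable relocation maps that the new relinking support reads back out of a library's dSYM bundle. As a minimal, self-contained sketch of what one YAML entry means -- the struct and function below are illustrative stand-ins, not dsymutil code; only the field semantics mirror the ValidReloc fields and the relocate()/getRelocValue() logic introduced later in this patch:

#include <cstdint>
#include <optional>
#include <string>

// Field names track the YAML keys in the test inputs above.
struct RelocEntry {
  uint64_t Offset;                    // "offset": byte position to patch in the debug info
  uint32_t Size;                      // "size": width of the patched value in bytes
  uint64_t Addend;                    // "addend": constant folded into the relocation
  std::string SymName;                // "symName": the referenced symbol
  std::optional<uint64_t> SymObjAddr; // "symObjAddr": symbol address in the object file
  uint64_t SymBinAddr;                // "symBinAddr": symbol address in the linked binary
};

// The value written into the linked debug info: the symbol's address in the
// final binary plus the addend, minus the object-file address when one is
// recorded (so object-relative values are not offset twice).
uint64_t linkedValue(const RelocEntry &R) {
  uint64_t Value = R.SymBinAddr + R.Addend;
  if (R.SymObjAddr)
    Value -= *R.SymObjAddr;
  return Value;
}

For the first _display entry above, for instance, this works out to 0x3F1C + 0x0 - 0x0 = 0x3F1C as the address patched into the relinked debug info.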
diff --git a/llvm/test/tools/dsymutil/Inputs/two-level-relink.macho.arm64.dylib b/llvm/test/tools/dsymutil/Inputs/two-level-relink.macho.arm64.dylib new file mode 100644 index 0000000000000000000000000000000000000000..333ac9f5d904ce2e802bee5ce44bc0346498c85b GIT binary patch literal 50944 zcmeI*e{54#6bJCLwwqJNhCp-+!%aj2!diYTWYI**uK=R33}R;ZHi5h~$z#qgC771CRCiw z)6zMW-7G5^YE7zuhs=D%vz)bv+9I**&UQ(ZGDFKsB%5mzx2H1mZB+UA-s~o^nJ-u6 z)9aR1n@GmP(N$`HX1=W|pIfc#ScUy^%baC3M_a7@+0Tgm zu&9atb?IF^rqr=tjTy07m+Y{tU?3S#d+k{6^G>Yu>E}t}p}ZE_d0t$!WbyoYi}bUb zRyqA@)bT@3Eca&K-*Aro+zT`2lP$5bNN9B^QWg(I!qEnAW1zM^<_*LfXO??|ZIN&d zpTMo0CB&W5L@Rliq2iSsOMP+=l6x=5Q+RAWLR85G4CCWU?$RcxF2ASA+|Bwp?AMz3 z{DEA?@GiMd#raddBgdz8?EKW{zdXNc*>s*O2p8BgI&MG+~oIC z!Ci7W*O4=`$k&{R`}nSSToXsq*>G$BDMy>^R(& zc3Wrsl&X^P)?Di8;^RvNbnsGv|6p!5Ju1iN_?Xu^^`4288adb=e)o0n2$%mlxk}C~ zUF^J%TdefWQC^LecjNs$O4skX%dy$y{oOo%^SjV~`8N1R}}0Sd1R7 z2~bfmoM?&!+UQA{nZTQhbH__M?m_ubP3o@K0gn^82aR}IMqJBr9cL;3B=i=Acr`b#p_&3ZoJeP%tII5z9qk`aH!vD7moqn-*?&m+7p_2^jF zlgsncB$j&gdI86!+;x+6v%a$&oArGs>v}VfJsI&ejt8pGuj(7i>r$VN<$B^JU7EyF zpSf<Wffo}j8n9zUr^$9yJVo z$@3!X`uPzLiS_-CfH=IJ4YaOHJ`uYnxsQwnf4e(LFIa4oIQzO`omn)VI99C~}*+Y!Tr}+1=65c%XCnnem#BpjcXs&xpz#-nfX_3N!_*K!w&p-b=RpC>&{)+ z?Am{2sQmZ4p6@NW{_?KQ&BwofcihN{?_FFmJ@M?`?l}klT=2xxFSyt3yx;soo&U@)uwC(sUi~a-1bU{D> literal 0 HcmV?d00001 diff --git a/llvm/test/tools/dsymutil/Inputs/variant-relink.macho.arm64.dylib b/llvm/test/tools/dsymutil/Inputs/variant-relink.macho.arm64.dylib new file mode 100644 index 0000000000000000000000000000000000000000..3c434096f7dc9736894757e70a423396128c57be GIT binary patch literal 50944 zcmeI5U2IfE6vxkg&{ioeLPKb5?Mi@Dpu38Qd?^c-NPz%>MyrUox80U**zS_uTci|W zLsG0Us3n*XpFpA_(Le~{BSEcFQG-SxiqW8=#sDfPiHU)T^_+WWy0@D)eIVh*|ADhJ z|Cw{oJ@cE{eVg=)>zDt{C-M+m4*PKSb$+6DyGK;ax`Ss8qk3LVHH&NuXALMlgonjB zm2HM$HLSCg;3hj?V74>Yq(Ve{GvSgb?Usg-vRdj=y+hggI#fPhn{A@q`SMggJ#QHG zDJvO`uTt^Z`L?TkJ~gkkl6`W^m|?WUpNYmJMl{}-uytR4J5)ZsUeWIQbY9)cKJ2jb zRL-fMV?45C@p5N3!YX2bR_Ya(4VJw%j8zHOHn8qLUmnNW7IvOj%046ZVNnBp>(Wu& zrd2~^V~VbyA#>u6b< zdws)LhuKGGjjuJ6Q8R8;BpYJU`08M@S>Kchn#ty=Q-YD!ShSAAZJWX*ggbQ+)pB#c z(zUFmK3Rk0-p_g>w@t^{xqzYEU&<~vPPu%aCa}BfW7)@Q`T731+{sb$p6x%+?8yCW z*4`Ia20t{5o|`_I#{%q5&I0Zg*nQa_%K6(T@3ZR{%_iTMY3!2Ad5=8lp-@XI8RB(| zFKT)|-@4(WP6qKZEq%Ci6knme(e}ZfE}wC>=Mg&H zQ%FZI7KV@J4WTH-&E3WUkOW$pBe>b0Revh_qe-m*IPpxxh*UuMiC5^bYwC?TUyC<_00@8p2!H?x zfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xfB*=9 z00@8p2!H?xfB*=900@8p2!H?xfB*=900@8p2!H?xWDy9kUdmp>?orN-ZGP2`R;_Mq z9nK*N>k;*=S7~ESiuIEqcFOp%jYNJkW;G@f zbX%QC#gS-gP0VbiyJciN2bBcIN;>w4d?+jDHlhW$_hsottQ+~M$dwCrm^>QfkHBp1 zuobb7b!nNG_mr@UiC&yF?|y$f&o@b+vIMv8de&v>eXRSdXO^m`l;@=$t#v&C9v9Q~ zxaZyVe8(~Fdb(M=>-juO=W@dS>Um1lBikwUXszoJMq;`i_q@BF2ROi8PZ?`>J>^+i zSoc>?TGewqzfUjYaEvpU(sYT}hpl+)UoS8C4xKF8|l3Pp|easTtC<;oye#?ccup!u;na zy*b!?#d|q5zqS`W{=|h#m9KibPyBP~u;-YybLfnHBXf6o&XxY~)o)v_zS8L1H7RH6 zU-e~;SBDKf^5vD?XI5-J|I0S-!OMf?zuEbMVA-|Dc6V+&d1&jH5#=vkSTQ*@V}JMD zqklen*SH0~&9B{T{=uIAW8#x}B|VF*O4GY#)Q{C|ueWVE8K_FktUZ3_SkHssJj+}C EA1fw95C8xG literal 0 HcmV?d00001 diff --git a/llvm/test/tools/dsymutil/basic-linking.test b/llvm/test/tools/dsymutil/basic-linking.test index cc521430b0c87..88cd3293efa24 100644 --- a/llvm/test/tools/dsymutil/basic-linking.test +++ b/llvm/test/tools/dsymutil/basic-linking.test @@ -2,6 +2,9 @@ RUN: dsymutil -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic.macho.x86 RUN: dsymutil -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic-lto.macho.x86_64 | FileCheck %s --check-prefix=CHECK-LTO RUN: dsymutil -no-output -verbose -oso-prepend-path=%p 
%p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefix=CHECK-ARCHIVE RUN: dsymutil -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 %p/Inputs/basic-lto.macho.x86_64 %p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefixes=CHECK,CHECK-LTO,CHECK-ARCHIVE +RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -D %p/Inputs %p/Inputs/basic-relink.macho.arm64.dylib | FileCheck %s --check-prefix=CHECK-RELINK +RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -D %p/Inputs %p/Inputs/two-level-relink.macho.arm64.dylib | FileCheck %s --check-prefix=CHECK-RELINK-TWO +RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -build-variant-suffix=_debug -D WrongPath -D %p/Inputs %p/Inputs/variant-relink.macho.arm64.dylib | FileCheck %s --check-prefix=CHECK-RELINK-VARIANT This test check the basic Dwarf linking process through the debug dumps. @@ -175,3 +178,122 @@ CHECK-ARCHIVE: Found valid debug map entry: _inc 0x0000000000000070 => 0x0000000 CHECK-ARCHIVE-NEXT: Keeping subprogram DIE: CHECK-ARCHIVE-NEXT: DW_TAG_subprogram CHECK-ARCHIVE-NEXT: DW_AT_name {{.*}}"inc") + + +================================= Simple relink ================================ +CHECK-RELINK: DEBUG MAP OBJECT: {{.*}}basic-relink.macho.arm64.o +CHECK-RELINK: Input compilation unit: +CHECK-RELINK-NEXT: TAG_compile_unit +CHECK-RELINK-NOT: TAG +CHECK-RELINK: AT_name {{.*}}basic-relink.macho.arm64.c + +CHECK-RELINK: DEBUG MAP OBJECT: {{.*}}foo-relink.dylib +CHECK-RELINK: Input compilation unit: +CHECK-RELINK-NEXT: TAG_compile_unit +CHECK-RELINK-NOT: TAG +CHECK-RELINK: AT_name {{.*}}foo-relink.c + +CHECK-RELINK: Input compilation unit: +CHECK-RELINK-NEXT: TAG_compile_unit +CHECK-RELINK-NOT: TAG +CHECK-RELINK: AT_name {{.*}}altfoo-relink.c + +CHECK-RELINK: DEBUG MAP OBJECT: {{.*}}bar-relink.dylib +CHECK-RELINK: Input compilation unit: +CHECK-RELINK-NEXT: TAG_compile_unit +CHECK-RELINK-NOT: TAG +CHECK-RELINK: AT_name {{.*}}bar-relink.c + +CHECK-RELINK-NOT: Found valid debug map entry +CHECK-RELINK: Found valid debug map entry: _display 0x0000000000000000 => 0x0000000000003f10 +CHECK-RELINK-NEXT: Keeping subprogram DIE: +CHECK-RELINK-NEXT: DW_TAG_subprogram +CHECK-RELINK: DW_AT_name{{.*}}"display" + +CHECK-RELINK: Found valid debug map entry: _foo 0x0000000000003f54 => 0x0000000000003f2c +CHECK-RELINK-NEXT: Keeping subprogram DIE: +CHECK-RELINK-NEXT: DW_TAG_subprogram +CHECK-RELINK: DW_AT_name {{.*}}"foo" + +CHECK-RELINK-NOT: Found valid debug map entry +CHECK-RELINK: Found valid debug map entry: _foo_unused 0x0000000000003f74 => 0x0000000000003f4c +CHECK-RELINK-NEXT: Keeping subprogram DIE: +CHECK-RELINK-NEXT: DW_TAG_subprogram +CHECK-RELINK: DW_AT_name {{.*}}"foo_unused" + +CHECK-RELINK-NOT: Found valid debug map entry +CHECK-RELINK: Found valid debug map entry: _altfoo 0x0000000000003f7c => 0x0000000000003f54 +CHECK-RELINK-NEXT: Keeping subprogram DIE: +CHECK-RELINK-NEXT: DW_TAG_subprogram +CHECK-RELINK: DW_AT_name {{.*}}"altfoo" + +CHECK-RELINK-NOT: Found valid debug map entry +CHECK-RELINK: Found valid debug map entry: _baz 0x0000000000004000 => 0x0000000000008000 +CHECK-RELINK-NEXT: Keeping variable DIE: +CHECK-RELINK-NEXT: DW_TAG_variable +CHECK-RELINK-NEXT: DW_AT_name {{.*}}"baz" + +CHECK-RELINK-NOT: Found valid debug map entry +CHECK-RELINK: Found valid debug map entry: _bar 0x0000000000003fa0 => 0x0000000000003f78 +CHECK-RELINK-NEXT: Keeping subprogram DIE: +CHECK-RELINK-NEXT: DW_TAG_subprogram +CHECK-RELINK: DW_AT_name {{.*}}"bar" + +================================= Two level 
relink ================================ +CHECK-RELINK-TWO: DEBUG MAP OBJECT: {{.*}}proxy-relink.dylib +CHECK-RELINK-TWO: Input compilation unit: +CHECK-RELINK-TWO-NEXT: TAG_compile_unit +CHECK-RELINK-TWO-NOT: TAG +CHECK-RELINK-TWO: AT_name {{.*}}two-level-relink.macho.arm64.c + +CHECK-RELINK-TWO: Input compilation unit: +CHECK-RELINK-TWO-NEXT: TAG_compile_unit +CHECK-RELINK-TWO-NOT: TAG +CHECK-RELINK-TWO: AT_name {{.*}}bar-relink.c +CHECK-RELINK-TWO: DW_AT_APPLE_origin {{.*}}/path/to/bar-relink.dylib + +CHECK-RELINK-TWO: Input compilation unit: +CHECK-RELINK-TWO-NEXT: TAG_compile_unit +CHECK-RELINK-TWO-NOT: TAG +CHECK-RELINK-TWO: AT_name {{.*}}foo-relink.c +CHECK-RELINK-TWO: DW_AT_APPLE_origin {{.*}}/path/to/foo-relink.dylib + +CHECK-RELINK-TWO: Input compilation unit: +CHECK-RELINK-TWO-NEXT: TAG_compile_unit +CHECK-RELINK-TWO-NOT: TAG +CHECK-RELINK-TWO: AT_name {{.*}}altfoo-relink.c +CHECK-RELINK-TWO: DW_AT_APPLE_origin {{.*}}/path/to/foo-relink.dylib + +CHECK-RELINK-TWO-NOT: Found valid debug map entry +CHECK-RELINK-TWO: Found valid debug map entry: _display 0x0000000000003f1c => 0x0000000000003f1c +CHECK-RELINK-TWO-NEXT: Keeping subprogram DIE: +CHECK-RELINK-TWO-NEXT: DW_TAG_subprogram +CHECK-RELINK-TWO: DW_AT_name{{.*}}"display" + +CHECK-RELINK-TWO-NOT: Found valid debug map entry +CHECK-RELINK-TWO: Found valid debug map entry: _baz 0x0000000000008000 => 0x0000000000008000 +CHECK-RELINK-TWO-NEXT: Keeping variable DIE: +CHECK-RELINK-TWO-NEXT: DW_TAG_variable +CHECK-RELINK-TWO-NEXT: DW_AT_name {{.*}}"baz" + +CHECK-RELINK-TWO-NOT: Found valid debug map entry +CHECK-RELINK-TWO: Found valid debug map entry: _bar 0x0000000000003f38 => 0x0000000000003f38 +CHECK-RELINK-TWO-NEXT: Keeping subprogram DIE: +CHECK-RELINK-TWO-NEXT: DW_TAG_subprogram +CHECK-RELINK-TWO: DW_AT_name {{.*}}"bar" + +CHECK-RELINK-TWO: Found valid debug map entry: _foo 0x0000000000003f40 => 0x0000000000003f40 +CHECK-RELINK-TWO-NEXT: Keeping subprogram DIE: +CHECK-RELINK-TWO-NEXT: DW_TAG_subprogram +CHECK-RELINK-TWO: DW_AT_name {{.*}}"foo" + +CHECK-RELINK-TWO-NOT: Found valid debug map entry +CHECK-RELINK-TWO: Found valid debug map entry: _altfoo 0x0000000000003f64 => 0x0000000000003f64 +CHECK-RELINK-TWO-NEXT: Keeping subprogram DIE: +CHECK-RELINK-TWO-NEXT: DW_TAG_subprogram +CHECK-RELINK-TWO: DW_AT_name {{.*}}"altfoo" + +================================= Build variants relink ================================ +CHECK-RELINK-VARIANT: DEBUG MAP OBJECT: {{.*}}basic-relink.macho.arm64.o +CHECK-RELINK-VARIANT: DEBUG MAP OBJECT: {{.*}}foo-relink-variant_debug.dylib +CHECK-RELINK-VARIANT: DEBUG MAP OBJECT: {{.*}}bar-relink-variant.dylib diff --git a/llvm/test/tools/dsymutil/cmdline.test b/llvm/test/tools/dsymutil/cmdline.test index 2317852f3c489..36cf3f542695c 100644 --- a/llvm/test/tools/dsymutil/cmdline.test +++ b/llvm/test/tools/dsymutil/cmdline.test @@ -7,7 +7,9 @@ HELP-NOT: -reverse-iterate HELP: Dsymutil Options: CHECK: -accelerator CHECK: -arch +CHECK: -build-variant-suffix CHECK: -dump-debug-map +CHECK: -D CHECK: -fat64 CHECK: -flat CHECK: -gen-reproducer diff --git a/llvm/tools/dsymutil/CMakeLists.txt b/llvm/tools/dsymutil/CMakeLists.txt index 3cb7594d2fd92..c612bfd9150c4 100644 --- a/llvm/tools/dsymutil/CMakeLists.txt +++ b/llvm/tools/dsymutil/CMakeLists.txt @@ -30,6 +30,7 @@ add_llvm_tool(dsymutil MachODebugMapParser.cpp MachOUtils.cpp Reproducer.cpp + RelocationMap.cpp SymbolMap.cpp DEPENDS diff --git a/llvm/tools/dsymutil/DebugMap.cpp b/llvm/tools/dsymutil/DebugMap.cpp index d4e2c2b2cfac7..dcdecdfe8210b 100644 --- 
a/llvm/tools/dsymutil/DebugMap.cpp +++ b/llvm/tools/dsymutil/DebugMap.cpp @@ -45,6 +45,11 @@ DebugMapObject::DebugMapObject(StringRef ObjectFilename, bool DebugMapObject::addSymbol(StringRef Name, std::optional ObjectAddress, uint64_t LinkedAddress, uint32_t Size) { + if (Symbols.count(Name)) { + // Symbol was previously added. + return true; + } + auto InsertResult = Symbols.insert( std::make_pair(Name, SymbolMapping(ObjectAddress, LinkedAddress, Size))); @@ -53,6 +58,12 @@ bool DebugMapObject::addSymbol(StringRef Name, return InsertResult.second; } +void DebugMapObject::setRelocationMap(dsymutil::RelocationMap &RM) { + RelocMap.emplace(RM); +} + +void DebugMapObject::setInstallName(StringRef IN) { InstallName.emplace(IN); } + void DebugMapObject::print(raw_ostream &OS) const { OS << getObjectFilename() << ":\n"; // Sort the symbols in alphabetical order, like llvm-nm (and to get @@ -158,8 +169,8 @@ struct MappingTraits::YamlDMO { std::vector Entries; }; -void MappingTraits>:: - mapping(IO &io, std::pair &s) { +void MappingTraits>::mapping( + IO &io, std::pair &s) { io.mapRequired("sym", s.first); io.mapOptional("objAddr", s.second.ObjectAddress); io.mapRequired("binAddr", s.second.BinaryAddress); @@ -275,7 +286,13 @@ MappingTraits::YamlDMO::denormalize(IO &IO) { } } - dsymutil::DebugMapObject Res(Path, sys::toTimePoint(Timestamp), MachO::N_OSO); + uint8_t Type = MachO::N_OSO; + if (Path.endswith(".dylib")) { + // FIXME: find a more resilient way + Type = MachO::N_LIB; + } + dsymutil::DebugMapObject Res(Path, sys::toTimePoint(Timestamp), Type); + for (auto &Entry : Entries) { auto &Mapping = Entry.second; std::optional ObjAddress; diff --git a/llvm/tools/dsymutil/DebugMap.h b/llvm/tools/dsymutil/DebugMap.h index 86cb88d32492d..9c3a698fa1191 100644 --- a/llvm/tools/dsymutil/DebugMap.h +++ b/llvm/tools/dsymutil/DebugMap.h @@ -21,6 +21,7 @@ #ifndef LLVM_TOOLS_DSYMUTIL_DEBUGMAP_H #define LLVM_TOOLS_DSYMUTIL_DEBUGMAP_H +#include "RelocationMap.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -134,22 +135,6 @@ class DebugMap { /// linked binary for all the linked atoms in this object file. class DebugMapObject { public: - struct SymbolMapping { - std::optional ObjectAddress; - yaml::Hex64 BinaryAddress; - yaml::Hex32 Size; - - SymbolMapping(std::optional ObjectAddr, uint64_t BinaryAddress, - uint32_t Size) - : BinaryAddress(BinaryAddress), Size(Size) { - if (ObjectAddr) - ObjectAddress = *ObjectAddr; - } - - /// For YAML IO support - SymbolMapping() = default; - }; - using YAMLSymbolMapping = std::pair; using DebugMapEntry = StringMapEntry; @@ -182,6 +167,16 @@ class DebugMapObject { } const std::vector &getWarnings() const { return Warnings; } + const std::optional &getRelocationMap() const { + return RelocMap; + } + void setRelocationMap(dsymutil::RelocationMap &RM); + + const std::optional &getInstallName() const { + return InstallName; + } + void setInstallName(StringRef IN); + void print(raw_ostream &OS) const; #ifndef NDEBUG void dump() const; @@ -196,10 +191,13 @@ class DebugMapObject { std::string Filename; sys::TimePoint Timestamp; - StringMap Symbols; + StringMap Symbols; DenseMap AddressToMapping; uint8_t Type; + std::optional RelocMap; + std::optional InstallName; + std::vector Warnings; /// For YAMLIO support. 
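Before the YAML traits changes below, note how the new optional state on DebugMapObject is meant to flow. The helper function here is hypothetical -- only the accessor names come from the header above -- but it matches the steps MachODebugMapParser performs further down in this patch when it resolves an N_LIB stab:

#include "DebugMap.h"
#include "RelocationMap.h"

// Hypothetical glue, assuming a parsed RelocationMap and the library's
// install name are already in hand; this is a sketch, not code from the patch.
static void attachLibraryInfo(llvm::dsymutil::DebugMapObject &DMO,
                              llvm::dsymutil::RelocationMap &RM,
                              llvm::StringRef InstallName) {
  DMO.setInstallName(InstallName); // taken from the N_LIB stab's name
  DMO.setRelocationMap(RM);        // parsed from Contents/Resources/Relocations
  // Both getters return std::optional, so consumers test before use:
  if (DMO.getRelocationMap())
    (void)DMO.getRelocationMap()->getNumberOfEntries();
}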
@@ -225,10 +223,8 @@ namespace yaml { using namespace llvm::dsymutil; -template <> -struct MappingTraits> { - static void mapping(IO &io, - std::pair &s); +template <> struct MappingTraits> { + static void mapping(IO &io, std::pair &s); static const bool flow = true; }; @@ -237,12 +233,6 @@ template <> struct MappingTraits { static void mapping(IO &io, dsymutil::DebugMapObject &DMO); }; -template <> struct ScalarTraits { - static void output(const Triple &val, void *, raw_ostream &out); - static StringRef input(StringRef scalar, void *, Triple &value); - static QuotingType mustQuote(StringRef) { return QuotingType::Single; } -}; - template <> struct SequenceTraits>> { static size_t diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index 39776ae5a9200..a8fea1e271227 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -189,6 +189,44 @@ static Error remarksErrorHandler(const DebugMapObject &DMO, return createFileError(FE->getFileName(), std::move(NewE)); } +template +Error DwarfLinkerForBinary::emitRelocations( + const DebugMap &DM, + std::vector> &ObjectsForLinking) { + // Return early if the "Resources" directory is not being written to. + if (!Options.ResourceDir) + return Error::success(); + + RelocationMap RM(DM.getTriple(), DM.getBinaryPath()); + for (auto &Obj : ObjectsForLinking) { + if (!Obj.OutRelocs->isInitialized()) + continue; + Obj.OutRelocs->addValidRelocs(RM); + } + + SmallString<128> InputPath; + SmallString<128> Path; + // Create the "Relocations" directory in the "Resources" directory, and + // create an architecture-specific directory in the "Relocations" directory. + StringRef ArchName = Triple::getArchName(RM.getTriple().getArch(), + RM.getTriple().getSubArch()); + sys::path::append(Path, *Options.ResourceDir, "Relocations", ArchName); + if (std::error_code EC = sys::fs::create_directories(Path.str(), true, + sys::fs::perms::all_all)) + return errorCodeToError(EC); + + // Append the file name. 
+ sys::path::append(Path, sys::path::filename(DM.getBinaryPath())); + Path.append(".yml"); + + std::error_code EC; + raw_fd_ostream OS(Path.str(), EC, sys::fs::OF_Text); + if (EC) + return errorCodeToError(EC); + + RM.print(OS); + return Error::success(); +} static Error emitRemarks(const LinkOptions &Options, StringRef BinaryPath, StringRef ArchName, const remarks::RemarkLinker &RL) { @@ -229,30 +267,31 @@ static Error emitRemarks(const LinkOptions &Options, StringRef BinaryPath, } template -ErrorOr> -DwarfLinkerForBinary::loadObject(const DebugMapObject &Obj, - const DebugMap &DebugMap, - remarks::RemarkLinker &RL) { +ErrorOr> DwarfLinkerForBinary::loadObject( + const DebugMapObject &Obj, const DebugMap &DebugMap, + remarks::RemarkLinker &RL, + std::shared_ptr DLBRM) { auto ErrorOrObj = loadObject(Obj, DebugMap.getTriple()); std::unique_ptr Res; if (ErrorOrObj) { + auto Context = DWARFContext::create( + *ErrorOrObj, DWARFContext::ProcessDebugRelocations::Process, nullptr, + "", + [&](Error Err) { + handleAllErrors(std::move(Err), [&](ErrorInfoBase &Info) { + reportError(Info.message()); + }); + }, + [&](Error Warning) { + handleAllErrors(std::move(Warning), [&](ErrorInfoBase &Info) { + reportWarning(Info.message()); + }); + }); + DLBRM->init(*Context); Res = std::make_unique( - Obj.getObjectFilename(), - DWARFContext::create( - *ErrorOrObj, DWARFContext::ProcessDebugRelocations::Process, - nullptr, "", - [&](Error Err) { - handleAllErrors(std::move(Err), [&](ErrorInfoBase &Info) { - reportError(Info.message()); - }); - }, - [&](Error Warning) { - handleAllErrors(std::move(Warning), [&](ErrorInfoBase &Info) { - reportWarning(Info.message()); - }); - }), - std::make_unique(*this, *ErrorOrObj, Obj), + Obj.getObjectFilename(), std::move(Context), + std::make_unique(*this, *ErrorOrObj, Obj, DLBRM), [&](StringRef FileName) { BinHolder.eraseObjectEntry(FileName); }); Error E = RL.link(*ErrorOrObj); @@ -614,7 +653,7 @@ template bool DwarfLinkerForBinary::linkImpl( const DebugMap &Map, typename Linker::OutputFileType ObjectType) { - std::vector> ObjectsForLinking; + std::vector> ObjectsForLinking; DebugMap DebugMap(Map.getTriple(), Map.getBinaryPath()); @@ -668,10 +707,12 @@ bool DwarfLinkerForBinary::linkImpl( auto &Obj = DebugMap.addDebugMapObject( Path, sys::TimePoint(), MachO::N_OSO); + auto DLBRelocMap = std::make_shared(); if (ErrorOr> ErrorOrObj = - loadObject(Obj, DebugMap, RL)) { - ObjectsForLinking.emplace_back(std::move(*ErrorOrObj)); - return *ObjectsForLinking.back(); + loadObject(Obj, DebugMap, RL, + DLBRelocMap)) { + ObjectsForLinking.emplace_back(std::move(*ErrorOrObj), DLBRelocMap); + return *ObjectsForLinking.back().Object; } else { // Try and emit more helpful warnings by applying some heuristics. 
StringRef ObjFile = ContainerName; @@ -782,15 +823,18 @@ bool DwarfLinkerForBinary::linkImpl( continue; } + auto DLBRelocMap = std::make_shared(); if (ErrorOr> ErrorOrObj = - loadObject(*Obj, Map, RL)) { - ObjectsForLinking.emplace_back(std::move(*ErrorOrObj)); - GeneralLinker->addObjectFile(*ObjectsForLinking.back(), Loader, + loadObject(*Obj, Map, RL, DLBRelocMap)) { + ObjectsForLinking.emplace_back(std::move(*ErrorOrObj), DLBRelocMap); + GeneralLinker->addObjectFile(*ObjectsForLinking.back().Object, Loader, OnCUDieLoaded); } else { - ObjectsForLinking.push_back(std::make_unique( - Obj->getObjectFilename(), nullptr, nullptr)); - GeneralLinker->addObjectFile(*ObjectsForLinking.back()); + ObjectsForLinking.push_back( + {std::make_unique(Obj->getObjectFilename(), nullptr, + nullptr), + DLBRelocMap}); + GeneralLinker->addObjectFile(*ObjectsForLinking.back().Object); } } @@ -815,6 +859,10 @@ bool DwarfLinkerForBinary::linkImpl( if (Options.NoOutput) return true; + if (Error E = + emitRelocations(Map, ObjectsForLinking)) + return error(toString(std::move(E))); + if (Options.ResourceDir && !ParseableSwiftInterfaces.empty()) { StringRef ArchName = Triple::getArchTypeName(Map.getTriple().getArch()); if (auto E = copySwiftInterfaces(ArchName)) @@ -903,12 +951,14 @@ void DwarfLinkerForBinary::AddressManager:: continue; } if (const auto *Mapping = DMO.lookupSymbol(*SymbolName)) - ValidRelocs.emplace_back(Offset64, RelocSize, Addend, Mapping); + ValidRelocs.emplace_back(Offset64, RelocSize, Addend, Mapping->getKey(), + Mapping->getValue()); } else if (const auto *Mapping = DMO.lookupObjectAddress(SymAddress)) { // Do not store the addend. The addend was the address of the symbol in // the object file, the address in the binary that is stored in the debug // map doesn't need to be offset. - ValidRelocs.emplace_back(Offset64, RelocSize, SymOffset, Mapping); + ValidRelocs.emplace_back(Offset64, RelocSize, SymOffset, + Mapping->getKey(), Mapping->getValue()); } } } @@ -966,20 +1016,17 @@ bool DwarfLinkerForBinary::AddressManager:: } template -std::vector< - typename DwarfLinkerForBinary::AddressManager::ValidReloc> +std::vector DwarfLinkerForBinary::AddressManager::getRelocations( const std::vector &Relocs, uint64_t StartPos, uint64_t EndPos) { - std::vector< - DwarfLinkerForBinary::AddressManager::ValidReloc> - Res; + std::vector Res; auto CurReloc = partition_point(Relocs, [StartPos](const ValidReloc &Reloc) { - return Reloc.Offset < StartPos; + return (uint64_t)Reloc.Offset < StartPos; }); while (CurReloc != Relocs.end() && CurReloc->Offset >= StartPos && - CurReloc->Offset < EndPos) { + (uint64_t)CurReloc->Offset < EndPos) { Res.push_back(*CurReloc); CurReloc++; } @@ -990,12 +1037,12 @@ DwarfLinkerForBinary::AddressManager::getRelocations( template void DwarfLinkerForBinary::AddressManager::printReloc( const ValidReloc &Reloc) { - const auto &Mapping = Reloc.Mapping->getValue(); + const auto &Mapping = Reloc.SymbolMapping; const uint64_t ObjectAddress = Mapping.ObjectAddress ? 
uint64_t(*Mapping.ObjectAddress) : std::numeric_limits::max(); - outs() << "Found valid debug map entry: " << Reloc.Mapping->getKey() << "\t" + outs() << "Found valid debug map entry: " << Reloc.SymbolName << "\t" << format("0x%016" PRIx64 " => 0x%016" PRIx64 "\n", ObjectAddress, uint64_t(Mapping.BinaryAddress)); } @@ -1004,8 +1051,8 @@ template int64_t DwarfLinkerForBinary::AddressManager::getRelocValue( const ValidReloc &Reloc) { int64_t AddrAdjust = relocate(Reloc); - if (Reloc.Mapping->getValue().ObjectAddress) - AddrAdjust -= uint64_t(*Reloc.Mapping->getValue().ObjectAddress); + if (Reloc.SymbolMapping.ObjectAddress) + AddrAdjust -= uint64_t(*Reloc.SymbolMapping.ObjectAddress); return AddrAdjust; } @@ -1116,12 +1163,40 @@ std::optional DwarfLinkerForBinary::AddressManager< } } +template +std::optional DwarfLinkerForBinary::AddressManager< + AddressesMapBase>::getLibraryInstallName() { + return LibInstallName; +} + template uint64_t DwarfLinkerForBinary::AddressManager::relocate( const ValidReloc &Reloc) const { - return Reloc.Mapping->getValue().BinaryAddress + Reloc.Addend; + return Reloc.SymbolMapping.BinaryAddress + Reloc.Addend; +} + +template +void DwarfLinkerForBinary::AddressManager< + AddressesMapBase>::updateAndSaveValidRelocs(bool IsDWARF5, + uint64_t OriginalUnitOffset, + int64_t LinkedOffset, + uint64_t StartOffset, + uint64_t EndOffset) { + std::vector InRelocs = + getRelocations(ValidDebugInfoRelocs, StartOffset, EndOffset); + if (IsDWARF5) + InRelocs = getRelocations(ValidDebugAddrRelocs, StartOffset, EndOffset); + DwarfLinkerRelocMap->updateAndSaveValidRelocs( + IsDWARF5, InRelocs, OriginalUnitOffset, LinkedOffset); } +template +void DwarfLinkerForBinary::AddressManager:: + updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset, + uint64_t OutputUnitOffset) { + DwarfLinkerRelocMap->updateRelocationsWithUnitOffset(OriginalUnitOffset, + OutputUnitOffset); +} /// Apply the valid relocations found by findValidRelocs() to /// the buffer \p Data, taking into account that Data is at \p BaseOffset /// in the debug_info section. @@ -1133,6 +1208,7 @@ uint64_t DwarfLinkerForBinary::AddressManager::relocate( template bool DwarfLinkerForBinary::AddressManager::applyValidRelocs( MutableArrayRef Data, uint64_t BaseOffset, bool IsLittleEndian) { + std::vector Relocs = getRelocations( ValidDebugInfoRelocs, BaseOffset, BaseOffset + Data.size()); @@ -1148,9 +1224,47 @@ bool DwarfLinkerForBinary::AddressManager::applyValidRelocs( assert(CurReloc.Size <= sizeof(Buf)); memcpy(&Data[CurReloc.Offset - BaseOffset], Buf, CurReloc.Size); } - return Relocs.size() > 0; } +void DwarfLinkerForBinaryRelocationMap::init(DWARFContext &Context) { + for (const std::unique_ptr &CU : Context.compile_units()) + StoredValidDebugInfoRelocsMap.insert( + std::make_pair(CU->getOffset(), std::vector())); + // FIXME: Support relocations debug_addr (DWARF5). +} + +void DwarfLinkerForBinaryRelocationMap::addValidRelocs(RelocationMap &RM) { + for (const auto &DebugInfoRelocs : StoredValidDebugInfoRelocsMap) { + for (const auto &InfoReloc : DebugInfoRelocs.second) + RM.addRelocationMapEntry(InfoReloc); + } + // FIXME: Support relocations debug_addr (DWARF5). 
+} + +void DwarfLinkerForBinaryRelocationMap::updateRelocationsWithUnitOffset( + uint64_t OriginalUnitOffset, uint64_t OutputUnitOffset) { + std::vector &StoredValidDebugInfoRelocs = + StoredValidDebugInfoRelocsMap[OriginalUnitOffset]; + for (ValidReloc &R : StoredValidDebugInfoRelocs) { + R.Offset = (uint64_t)R.Offset + OutputUnitOffset; + } + // FIXME: Support relocations debug_addr (DWARF5). +} + +void DwarfLinkerForBinaryRelocationMap::updateAndSaveValidRelocs( + bool IsDWARF5, std::vector &InRelocs, uint64_t UnitOffset, + int64_t LinkedOffset) { + std::vector &OutRelocs = + StoredValidDebugInfoRelocsMap[UnitOffset]; + if (IsDWARF5) + OutRelocs = StoredValidDebugAddrRelocsMap[UnitOffset]; + + for (ValidReloc &R : InRelocs) { + OutRelocs.emplace_back(R.Offset + LinkedOffset, R.Size, R.Addend, + R.SymbolName, R.SymbolMapping); + } +} + } // namespace dsymutil } // namespace llvm diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.h b/llvm/tools/dsymutil/DwarfLinkerForBinary.h index 230f569a6988c..328cd9197d0d1 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.h +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.h @@ -13,6 +13,7 @@ #include "DebugMap.h" #include "LinkUtils.h" #include "MachOUtils.h" +#include "RelocationMap.h" #include "llvm/DWARFLinker/DWARFLinker.h" #include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h" #include "llvm/DWARFLinker/DWARFLinkerDeclContext.h" @@ -21,10 +22,48 @@ #include "llvm/Remarks/RemarkFormat.h" #include "llvm/Remarks/RemarkLinker.h" #include +#include namespace llvm { namespace dsymutil { +/// DwarfLinkerForBinaryRelocationMap contains the logic to handle the +/// relocations and to store them inside an associated RelocationMap. +class DwarfLinkerForBinaryRelocationMap { +public: + void init(DWARFContext &Context); + + bool isInitialized() { + return StoredValidDebugInfoRelocsMap.getMemorySize() != 0; + } + + void addValidRelocs(RelocationMap &RM); + + void updateAndSaveValidRelocs(bool IsDWARF5, + std::vector &InRelocs, + uint64_t UnitOffset, int64_t LinkedOffset); + + void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset, + uint64_t OutputUnitOffset); + + /// Map compilation unit offset to the valid relocations to store + /// @{ + DenseMap> StoredValidDebugInfoRelocsMap; + DenseMap> StoredValidDebugAddrRelocsMap; + /// @} + + DwarfLinkerForBinaryRelocationMap() = default; +}; + +template struct ObjectWithRelocMap { + ObjectWithRelocMap( + std::unique_ptr Object, + std::shared_ptr OutRelocs) + : Object(std::move(Object)), OutRelocs(OutRelocs) {} + std::unique_ptr Object; + std::shared_ptr OutRelocs; +}; + /// The core of the Dsymutil Dwarf linking logic. /// /// The link of the dwarf information from the object files will be @@ -67,26 +106,11 @@ class DwarfLinkerForBinary { /// Keeps track of relocations. template class AddressManager : public AddressesMapBase { - struct ValidReloc { - uint64_t Offset; - uint32_t Size; - uint64_t Addend; - const DebugMapObject::DebugMapEntry *Mapping; - - ValidReloc(uint64_t Offset, uint32_t Size, uint64_t Addend, - const DebugMapObject::DebugMapEntry *Mapping) - : Offset(Offset), Size(Size), Addend(Addend), Mapping(Mapping) {} - - bool operator<(const ValidReloc &RHS) const { - return Offset < RHS.Offset; - } - bool operator<(uint64_t RHS) const { return Offset < RHS; } - }; const DwarfLinkerForBinary &Linker; /// The valid relocations for the current DebugMapObject. - /// This vector is sorted by relocation offset. + /// These vectors are sorted by relocation offset. 
/// { std::vector ValidDebugInfoRelocs; std::vector ValidDebugAddrRelocs; @@ -94,6 +118,12 @@ class DwarfLinkerForBinary { StringRef SrcFileName; + uint8_t DebugMapObjectType; + + std::shared_ptr DwarfLinkerRelocMap; + + std::optional LibInstallName; + /// Returns list of valid relocations from \p Relocs, /// between \p StartOffset and \p NextOffset. /// @@ -115,9 +145,29 @@ class DwarfLinkerForBinary { public: AddressManager(DwarfLinkerForBinary &Linker, const object::ObjectFile &Obj, - const DebugMapObject &DMO) - : Linker(Linker), SrcFileName(DMO.getObjectFilename()) { - findValidRelocsInDebugSections(Obj, DMO); + const DebugMapObject &DMO, + std::shared_ptr DLBRM) + : Linker(Linker), SrcFileName(DMO.getObjectFilename()), + DebugMapObjectType(MachO::N_OSO), DwarfLinkerRelocMap(DLBRM) { + if (DMO.getRelocationMap().has_value()) { + DebugMapObjectType = MachO::N_LIB; + LibInstallName.emplace(DMO.getInstallName().value()); + const RelocationMap &RM = DMO.getRelocationMap().value(); + for (const auto &Reloc : RM.relocations()) { + const auto *DebugMapEntry = DMO.lookupSymbol(Reloc.SymbolName); + if (!DebugMapEntry) + continue; + std::optional ObjAddress; + ObjAddress.emplace(DebugMapEntry->getValue().ObjectAddress.value()); + ValidDebugInfoRelocs.emplace_back( + Reloc.Offset, Reloc.Size, Reloc.Addend, Reloc.SymbolName, + SymbolMapping(ObjAddress, DebugMapEntry->getValue().BinaryAddress, + DebugMapEntry->getValue().Size)); + // FIXME: Support relocations debug_addr. + } + } else { + findValidRelocsInDebugSections(Obj, DMO); + } } ~AddressManager() override { clear(); } @@ -158,9 +208,20 @@ class DwarfLinkerForBinary { std::optional getSubprogramRelocAdjustment(const DWARFDie &DIE) override; + std::optional getLibraryInstallName() override; + bool applyValidRelocs(MutableArrayRef Data, uint64_t BaseOffset, bool IsLittleEndian) override; + bool needToSaveValidRelocs() override { return true; } + + void updateAndSaveValidRelocs(bool IsDWARF5, uint64_t OriginalUnitOffset, + int64_t LinkedOffset, uint64_t StartOffset, + uint64_t EndOffset) override; + + void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset, + uint64_t OutputUnitOffset) override; + void clear() override { ValidDebugInfoRelocs.clear(); ValidDebugAddrRelocs.clear(); @@ -180,11 +241,11 @@ class DwarfLinkerForBinary { /// Attempt to load a debug object from disk. ErrorOr loadObject(const DebugMapObject &Obj, const Triple &triple); - template - ErrorOr> loadObject(const DebugMapObject &Obj, - const DebugMap &DebugMap, - remarks::RemarkLinker &RL); + ErrorOr> + loadObject(const DebugMapObject &Obj, const DebugMap &DebugMap, + remarks::RemarkLinker &RL, + std::shared_ptr DLBRM); void collectRelocationsToApplyToSwiftReflectionSections( const object::SectionRef &Section, StringRef &Contents, @@ -207,6 +268,11 @@ class DwarfLinkerForBinary { bool linkImpl(const DebugMap &Map, typename Linker::OutputFileType ObjectType); + template + Error emitRelocations( + const DebugMap &DM, + std::vector> &ObjectsForLinking); + raw_fd_ostream &OutFile; BinaryHolder &BinHolder; LinkOptions Options; diff --git a/llvm/tools/dsymutil/LinkUtils.h b/llvm/tools/dsymutil/LinkUtils.h index 88c17d5036899..0bf6d9aac1a3f 100644 --- a/llvm/tools/dsymutil/LinkUtils.h +++ b/llvm/tools/dsymutil/LinkUtils.h @@ -93,6 +93,12 @@ struct LinkOptions { llvm::IntrusiveRefCntPtr VFS = vfs::getRealFileSystem(); + /// -build-variant-suffix. + std::string BuildVariantSuffix; + + /// Paths where to search for the .dSYM files of merged libraries. 
+ std::vector DSYMSearchPaths; + /// Fields used for linking and placing remarks into the .dSYM bundle. /// @{ diff --git a/llvm/tools/dsymutil/MachODebugMapParser.cpp b/llvm/tools/dsymutil/MachODebugMapParser.cpp index d9bf2301e21e5..9623b71714582 100644 --- a/llvm/tools/dsymutil/MachODebugMapParser.cpp +++ b/llvm/tools/dsymutil/MachODebugMapParser.cpp @@ -9,6 +9,7 @@ #include "BinaryHolder.h" #include "DebugMap.h" #include "MachOUtils.h" +#include "RelocationMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/SmallSet.h" #include "llvm/Object/MachO.h" @@ -28,9 +29,13 @@ class MachODebugMapParser { public: MachODebugMapParser(llvm::IntrusiveRefCntPtr VFS, StringRef BinaryPath, ArrayRef Archs, - StringRef PathPrefix = "", bool Verbose = false) + ArrayRef DSYMSearchPaths, + StringRef PathPrefix = "", StringRef VariantSuffix = "", + bool Verbose = false) : BinaryPath(std::string(BinaryPath)), Archs(Archs.begin(), Archs.end()), - PathPrefix(std::string(PathPrefix)), BinHolder(VFS, Verbose), + DSYMSearchPaths(DSYMSearchPaths.begin(), DSYMSearchPaths.end()), + PathPrefix(std::string(PathPrefix)), + VariantSuffix(std::string(VariantSuffix)), BinHolder(VFS, Verbose), CurrentDebugMapObject(nullptr), SkipDebugMapObject(false) {} /// Parses and returns the DebugMaps of the input binary. The binary contains @@ -47,7 +52,9 @@ class MachODebugMapParser { private: std::string BinaryPath; SmallVector Archs; + SmallVector DSYMSearchPaths; std::string PathPrefix; + std::string VariantSuffix; /// Owns the MemoryBuffer for the main binary. BinaryHolder BinHolder; @@ -87,6 +94,9 @@ class MachODebugMapParser { void switchToNewDebugMapObject(StringRef Filename, sys::TimePoint Timestamp); + void + switchToNewLibDebugMapObject(StringRef Filename, + sys::TimePoint Timestamp); void resetParserState(); uint64_t getMainBinarySymbolAddress(StringRef Name); std::vector getMainBinarySymbolNames(uint64_t Value); @@ -176,8 +186,6 @@ void MachODebugMapParser::addCommonSymbols() { /// everything up to add symbols to the new one. void MachODebugMapParser::switchToNewDebugMapObject( StringRef Filename, sys::TimePoint Timestamp) { - addCommonSymbols(); - resetParserState(); SmallString<80> Path(PathPrefix); sys::path::append(Path, Filename); @@ -198,11 +206,138 @@ void MachODebugMapParser::switchToNewDebugMapObject( return; } + addCommonSymbols(); + resetParserState(); + CurrentDebugMapObject = &Result->addDebugMapObject(Path, Timestamp, MachO::N_OSO); + loadCurrentObjectFileSymbols(*Object); } +/// Create a new DebugMapObject of type MachO::N_LIB. +/// This function resets the state of the parser that was +/// referring to the last object file and sets everything +/// up to add symbols to the new one. 
+void MachODebugMapParser::switchToNewLibDebugMapObject( + StringRef Filename, sys::TimePoint Timestamp) { + + if (DSYMSearchPaths.empty()) { + Warning("no dSYM search path was specified"); + return; + } + + StringRef LeafName = sys::path::filename(Filename); + SmallString<128> VariantLeafName; + SmallString<128> ProductName(LeafName); + + // For Framework.framework/Framework and -build-variant-suffix=_debug, + // look in the following order: + // 1) Framework.framework.dSYM/Contents/Resources/DWARF/Framework_debug + // 2) Framework.framework.dSYM/Contents/Resources/DWARF/Framework + // + // For libName.dylib and -build-variant-suffix=_debug, + // look in the following order: + // 1) libName.dylib.dSYM/Contents/Resources/DWARF/libName_debug.dylib + // 2) libName.dylib.dSYM/Contents/Resources/DWARF/libName.dylib + + size_t libExt = LeafName.rfind(".dylib"); + if (libExt != StringRef::npos) { + if (!VariantSuffix.empty()) { + VariantLeafName.append(LeafName.substr(0, libExt)); + VariantLeafName.append(VariantSuffix); + VariantLeafName.append(".dylib"); + } + } else { + // Expected to be a framework + ProductName.append(".framework"); + if (!VariantSuffix.empty()) { + VariantLeafName.append(LeafName); + VariantLeafName.append(VariantSuffix); + } + } + + for (auto DSYMSearchPath : DSYMSearchPaths) { + SmallString<256> Path(DSYMSearchPath); + SmallString<256> FallbackPath(Path); + + SmallString<256> DSYMPath(ProductName); + DSYMPath.append(".dSYM"); + sys::path::append(DSYMPath, "Contents", "Resources", "DWARF"); + + if (!VariantSuffix.empty()) { + sys::path::append(Path, DSYMPath, VariantLeafName); + sys::path::append(FallbackPath, DSYMPath, LeafName); + } else { + sys::path::append(Path, DSYMPath, LeafName); + } + + auto ObjectEntry = BinHolder.getObjectEntry(Path, Timestamp); + if (!ObjectEntry) { + auto Err = ObjectEntry.takeError(); + Warning("unable to open object file: " + toString(std::move(Err)), + Path.str()); + if (!VariantSuffix.empty()) { + ObjectEntry = BinHolder.getObjectEntry(FallbackPath, Timestamp); + if (!ObjectEntry) { + auto Err = ObjectEntry.takeError(); + Warning("unable to open object file: " + toString(std::move(Err)), + FallbackPath.str()); + continue; + } + Path.assign(FallbackPath); + } else { + continue; + } + } + + auto Object = + ObjectEntry->getObjectAs(Result->getTriple()); + if (!Object) { + auto Err = Object.takeError(); + Warning("unable to open object file: " + toString(std::move(Err)), + Path.str()); + continue; + } + + if (CurrentDebugMapObject && + CurrentDebugMapObject->getType() == MachO::N_LIB && + CurrentDebugMapObject->getObjectFilename().compare(Path.str()) == 0) { + return; + } + + addCommonSymbols(); + resetParserState(); + + CurrentDebugMapObject = + &Result->addDebugMapObject(Path, Timestamp, MachO::N_LIB); + + CurrentDebugMapObject->setInstallName(Filename); + + SmallString<256> RMPath(DSYMSearchPath); + sys::path::append(RMPath, ProductName); + RMPath.append(".dSYM"); + StringRef ArchName = Triple::getArchName(Result->getTriple().getArch(), + Result->getTriple().getSubArch()); + sys::path::append(RMPath, "Contents", "Resources", "Relocations", ArchName); + sys::path::append(RMPath, LeafName); + RMPath.append(".yml"); + const auto &RelocMapPtrOrErr = + RelocationMap::parseYAMLRelocationMap(RMPath, PathPrefix); + if (auto EC = RelocMapPtrOrErr.getError()) { + Warning("cannot parse relocation map file: " + EC.message(), + RMPath.str()); + return; + } + CurrentDebugMapObject->setRelocationMap(*RelocMapPtrOrErr->get()); + + 
loadCurrentObjectFileSymbols(*Object); + + // Found and loaded new dSYM file + return; + } +} + static std::string getArchName(const object::MachOObjectFile &Obj) { Triple T = Obj.getArchTriple(); return std::string(T.getArchName()); @@ -275,23 +410,39 @@ struct DarwinStabName { const char *Name; }; -const struct DarwinStabName DarwinStabNames[] = { - {MachO::N_GSYM, "N_GSYM"}, {MachO::N_FNAME, "N_FNAME"}, - {MachO::N_FUN, "N_FUN"}, {MachO::N_STSYM, "N_STSYM"}, - {MachO::N_LCSYM, "N_LCSYM"}, {MachO::N_BNSYM, "N_BNSYM"}, - {MachO::N_PC, "N_PC"}, {MachO::N_AST, "N_AST"}, - {MachO::N_OPT, "N_OPT"}, {MachO::N_RSYM, "N_RSYM"}, - {MachO::N_SLINE, "N_SLINE"}, {MachO::N_ENSYM, "N_ENSYM"}, - {MachO::N_SSYM, "N_SSYM"}, {MachO::N_SO, "N_SO"}, - {MachO::N_OSO, "N_OSO"}, {MachO::N_LSYM, "N_LSYM"}, - {MachO::N_BINCL, "N_BINCL"}, {MachO::N_SOL, "N_SOL"}, - {MachO::N_PARAMS, "N_PARAM"}, {MachO::N_VERSION, "N_VERS"}, - {MachO::N_OLEVEL, "N_OLEV"}, {MachO::N_PSYM, "N_PSYM"}, - {MachO::N_EINCL, "N_EINCL"}, {MachO::N_ENTRY, "N_ENTRY"}, - {MachO::N_LBRAC, "N_LBRAC"}, {MachO::N_EXCL, "N_EXCL"}, - {MachO::N_RBRAC, "N_RBRAC"}, {MachO::N_BCOMM, "N_BCOMM"}, - {MachO::N_ECOMM, "N_ECOMM"}, {MachO::N_ECOML, "N_ECOML"}, - {MachO::N_LENG, "N_LENG"}, {0, nullptr}}; +const struct DarwinStabName DarwinStabNames[] = {{MachO::N_GSYM, "N_GSYM"}, + {MachO::N_FNAME, "N_FNAME"}, + {MachO::N_FUN, "N_FUN"}, + {MachO::N_STSYM, "N_STSYM"}, + {MachO::N_LCSYM, "N_LCSYM"}, + {MachO::N_BNSYM, "N_BNSYM"}, + {MachO::N_PC, "N_PC"}, + {MachO::N_AST, "N_AST"}, + {MachO::N_OPT, "N_OPT"}, + {MachO::N_RSYM, "N_RSYM"}, + {MachO::N_SLINE, "N_SLINE"}, + {MachO::N_ENSYM, "N_ENSYM"}, + {MachO::N_SSYM, "N_SSYM"}, + {MachO::N_SO, "N_SO"}, + {MachO::N_OSO, "N_OSO"}, + {MachO::N_LIB, "N_LIB"}, + {MachO::N_LSYM, "N_LSYM"}, + {MachO::N_BINCL, "N_BINCL"}, + {MachO::N_SOL, "N_SOL"}, + {MachO::N_PARAMS, "N_PARAM"}, + {MachO::N_VERSION, "N_VERS"}, + {MachO::N_OLEVEL, "N_OLEV"}, + {MachO::N_PSYM, "N_PSYM"}, + {MachO::N_EINCL, "N_EINCL"}, + {MachO::N_ENTRY, "N_ENTRY"}, + {MachO::N_LBRAC, "N_LBRAC"}, + {MachO::N_EXCL, "N_EXCL"}, + {MachO::N_RBRAC, "N_RBRAC"}, + {MachO::N_BCOMM, "N_BCOMM"}, + {MachO::N_ECOMM, "N_ECOMM"}, + {MachO::N_ECOML, "N_ECOML"}, + {MachO::N_LENG, "N_LENG"}, + {0, nullptr}}; static const char *getDarwinStabString(uint8_t NType) { for (unsigned i = 0; DarwinStabNames[i].Name; i++) { @@ -477,13 +628,25 @@ void MachODebugMapParser::handleStabSymbolTableEntry( const char *Name = &MainBinaryStrings.data()[StringIndex]; + // An N_LIB entry represents the start of a new library file description. + if (Type == MachO::N_LIB) { + switchToNewLibDebugMapObject(Name, sys::toTimePoint(Value)); + return; + } + // An N_OSO entry represents the start of a new object file description. + // If an N_LIB entry was present, this is parsed only if the library + // dSYM file could not be found. 
if (Type == MachO::N_OSO) { - if (Duplicates.count(OSO(Name, Value))) { - SkipDebugMapObject = true; - return; + if (!CurrentDebugMapObject || + CurrentDebugMapObject->getType() != MachO::N_LIB) { + if (Duplicates.count(OSO(Name, Value))) { + SkipDebugMapObject = true; + return; + } + switchToNewDebugMapObject(Name, sys::toTimePoint(Value)); } - return switchToNewDebugMapObject(Name, sys::toTimePoint(Value)); + return; } if (SkipDebugMapObject) @@ -694,18 +857,23 @@ namespace dsymutil { llvm::ErrorOr>> parseDebugMap(llvm::IntrusiveRefCntPtr VFS, StringRef InputFile, ArrayRef Archs, - StringRef PrependPath, bool Verbose, bool InputIsYAML) { + ArrayRef DSYMSearchPaths, StringRef PrependPath, + StringRef VariantSuffix, bool Verbose, bool InputIsYAML) { if (InputIsYAML) return DebugMap::parseYAMLDebugMap(InputFile, PrependPath, Verbose); - MachODebugMapParser Parser(VFS, InputFile, Archs, PrependPath, Verbose); + MachODebugMapParser Parser(VFS, InputFile, Archs, DSYMSearchPaths, + PrependPath, VariantSuffix, Verbose); + return Parser.parse(); } bool dumpStab(llvm::IntrusiveRefCntPtr VFS, StringRef InputFile, ArrayRef Archs, - StringRef PrependPath) { - MachODebugMapParser Parser(VFS, InputFile, Archs, PrependPath, false); + ArrayRef DSYMSearchPaths, StringRef PrependPath, + StringRef VariantSuffix) { + MachODebugMapParser Parser(VFS, InputFile, Archs, DSYMSearchPaths, + PrependPath, VariantSuffix, false); return Parser.dumpStab(); } } // namespace dsymutil diff --git a/llvm/tools/dsymutil/Options.td b/llvm/tools/dsymutil/Options.td index 79f04fdfb0360..da071341cc01f 100644 --- a/llvm/tools/dsymutil/Options.td +++ b/llvm/tools/dsymutil/Options.td @@ -201,3 +201,14 @@ def linker: Separate<["--", "-"], "linker">, HelpText<"Specify the desired type of DWARF linker. Defaults to 'apple'">, Group; def: Joined<["--", "-"], "linker=">, Alias; + +def build_variant_suffix: Separate<["--", "-"], "build-variant-suffix">, + MetaVarName<"">, + HelpText<"Specify the build variant suffix used to build the executabe file.">, + Group; +def: Joined<["--", "-"], "build-variant-suffix=">, Alias; + +def dsym_search_path: Separate<["-", "--"], "D">, + MetaVarName<"">, + HelpText<"Specify a directory that contain dSYM files to search for.">, + Group; diff --git a/llvm/tools/dsymutil/RelocationMap.cpp b/llvm/tools/dsymutil/RelocationMap.cpp new file mode 100644 index 0000000000000..5921e7c9c2495 --- /dev/null +++ b/llvm/tools/dsymutil/RelocationMap.cpp @@ -0,0 +1,92 @@ +//===- tools/dsymutil/RelocationMap.cpp - Relocation map representation---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RelocationMap.h" + +namespace llvm { + +namespace dsymutil { + +void RelocationMap::print(raw_ostream &OS) const { + yaml::Output yout(OS, /* Ctxt = */ nullptr, /* WrapColumn = */ 0); + yout << const_cast(*this); +} + +#ifndef NDEBUG +void RelocationMap::dump() const { print(errs()); } +#endif + +void RelocationMap::addRelocationMapEntry(const ValidReloc &Relocation) { + Relocations.push_back(Relocation); +} + +namespace { + +struct YAMLContext { + StringRef PrependPath; + Triple BinaryTriple; +}; + +} // end anonymous namespace + +ErrorOr> +RelocationMap::parseYAMLRelocationMap(StringRef InputFile, + StringRef PrependPath) { + auto ErrOrFile = MemoryBuffer::getFileOrSTDIN(InputFile); + if (auto Err = ErrOrFile.getError()) + return Err; + + YAMLContext Ctxt; + + Ctxt.PrependPath = PrependPath; + + std::unique_ptr Result; + yaml::Input yin((*ErrOrFile)->getBuffer(), &Ctxt); + yin >> Result; + + if (auto EC = yin.error()) + return EC; + return std::move(Result); +} + +} // end namespace dsymutil + +namespace yaml { + +void MappingTraits::mapping(IO &io, + dsymutil::ValidReloc &VR) { + io.mapRequired("offset", VR.Offset); + io.mapRequired("size", VR.Size); + io.mapRequired("addend", VR.Addend); + io.mapRequired("symName", VR.SymbolName); + io.mapOptional("symObjAddr", VR.SymbolMapping.ObjectAddress); + io.mapRequired("symBinAddr", VR.SymbolMapping.BinaryAddress); + io.mapRequired("symSize", VR.SymbolMapping.Size); +} + +void MappingTraits::mapping( + IO &io, dsymutil::RelocationMap &RM) { + io.mapRequired("triple", RM.BinaryTriple); + io.mapRequired("binary-path", RM.BinaryPath); + if (void *Ctxt = io.getContext()) + reinterpret_cast(Ctxt)->BinaryTriple = RM.BinaryTriple; + io.mapRequired("relocations", RM.Relocations); +} + +void MappingTraits>::mapping( + IO &io, std::unique_ptr &RM) { + if (!RM) + RM.reset(new RelocationMap()); + io.mapRequired("triple", RM->BinaryTriple); + io.mapRequired("binary-path", RM->BinaryPath); + if (void *Ctxt = io.getContext()) + reinterpret_cast(Ctxt)->BinaryTriple = RM->BinaryTriple; + io.mapRequired("relocations", RM->Relocations); +} +} // end namespace yaml +} // end namespace llvm diff --git a/llvm/tools/dsymutil/RelocationMap.h b/llvm/tools/dsymutil/RelocationMap.h new file mode 100644 index 0000000000000..3d851acf2b892 --- /dev/null +++ b/llvm/tools/dsymutil/RelocationMap.h @@ -0,0 +1,160 @@ +//===- tools/dsymutil/RelocationMap.h -------------------------- *- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This file contains the class declaration of the RelocationMap +/// entity. RelocationMap lists all the relocations of all the +/// atoms used in the object files linked together to +/// produce an executable. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_DSYMUTIL_RELOCATIONMAP_H +#define LLVM_TOOLS_DSYMUTIL_RELOCATIONMAP_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/TargetParser/Triple.h" + +#include +#include +#include + +namespace llvm { + +class raw_ostream; + +namespace dsymutil { + +struct SymbolMapping { + std::optional ObjectAddress; + yaml::Hex64 BinaryAddress; + yaml::Hex32 Size; + + SymbolMapping(std::optional ObjectAddr, uint64_t BinaryAddress, + uint32_t Size) + : BinaryAddress(BinaryAddress), Size(Size) { + if (ObjectAddr) + ObjectAddress = *ObjectAddr; + } + + /// For YAML IO support + SymbolMapping() = default; +}; + +/// ValidReloc represents one relocation entry described by the RelocationMap. +/// It contains a list of DWARF relocations to apply to a linked binary. +class ValidReloc { +public: + yaml::Hex64 Offset; + yaml::Hex32 Size; + yaml::Hex64 Addend; + std::string SymbolName; + struct SymbolMapping SymbolMapping; + + struct SymbolMapping getSymbolMapping() const { return SymbolMapping; } + + ValidReloc(uint64_t Offset, uint32_t Size, uint64_t Addend, + StringRef SymbolName, struct SymbolMapping SymbolMapping) + : Offset(Offset), Size(Size), Addend(Addend), SymbolName(SymbolName), + SymbolMapping(SymbolMapping) {} + + bool operator<(const ValidReloc &RHS) const { return Offset < RHS.Offset; } + + /// For YAMLIO support. + ValidReloc() = default; +}; + +/// The RelocationMap object stores the list of relocation entries for a binary +class RelocationMap { + Triple BinaryTriple; + std::string BinaryPath; + using RelocContainer = std::vector; + + RelocContainer Relocations; + + /// For YAML IO support. + ///@{ + friend yaml::MappingTraits>; + friend yaml::MappingTraits; + + RelocationMap() = default; + ///@} + +public: + RelocationMap(const Triple &BinaryTriple, StringRef BinaryPath) + : BinaryTriple(BinaryTriple), BinaryPath(std::string(BinaryPath)) {} + + using const_iterator = RelocContainer::const_iterator; + + iterator_range relocations() const { + return make_range(begin(), end()); + } + + const_iterator begin() const { return Relocations.begin(); } + + const_iterator end() const { return Relocations.end(); } + + size_t getNumberOfEntries() const { return Relocations.size(); } + + /// This function adds a ValidReloc to the list owned by this + /// relocation map. + void addRelocationMapEntry(const ValidReloc &Relocation); + + const Triple &getTriple() const { return BinaryTriple; } + + StringRef getBinaryPath() const { return BinaryPath; } + + void print(raw_ostream &OS) const; + +#ifndef NDEBUG + void dump() const; +#endif + + /// Read a relocation map from \a InputFile. 
+ static ErrorOr> + parseYAMLRelocationMap(StringRef InputFile, StringRef PrependPath); +}; + +} // end namespace dsymutil +} // end namespace llvm + +LLVM_YAML_IS_SEQUENCE_VECTOR(dsymutil::ValidReloc) + +namespace llvm { +namespace yaml { + +using namespace llvm::dsymutil; + +template <> struct MappingTraits { + static void mapping(IO &io, dsymutil::ValidReloc &VR); + static const bool flow = true; +}; + +template <> struct MappingTraits { + struct YamlRM; + static void mapping(IO &io, dsymutil::RelocationMap &RM); +}; + +template <> struct MappingTraits> { + struct YamlRM; + static void mapping(IO &io, std::unique_ptr &RM); +}; + +template <> struct ScalarTraits { + static void output(const Triple &val, void *, raw_ostream &out); + static StringRef input(StringRef scalar, void *, Triple &value); + static QuotingType mustQuote(StringRef) { return QuotingType::Single; } +}; + +} // end namespace yaml +} // end namespace llvm + +#endif // LLVM_TOOLS_DSYMUTIL_RELOCATIONMAP_H diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index 104895b1a90bd..2dd123318e00b 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -398,6 +398,12 @@ static Expected getOptions(opt::InputArgList &Args) { Options.LinkOpts.RemarksKeepAll = !Args.hasArg(OPT_remarks_drop_without_debug); + if (opt::Arg *BuildVariantSuffix = Args.getLastArg(OPT_build_variant_suffix)) + Options.LinkOpts.BuildVariantSuffix = BuildVariantSuffix->getValue(); + + for (auto *SearchPath : Args.filtered(OPT_dsym_search_path)) + Options.LinkOpts.DSYMSearchPaths.push_back(SearchPath->getValue()); + if (Error E = verifyOptions(Options)) return std::move(E); return Options; @@ -670,15 +676,18 @@ int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) { // Dump the symbol table for each input file and requested arch if (Options.DumpStab) { if (!dumpStab(Options.LinkOpts.VFS, InputFile, Options.Archs, - Options.LinkOpts.PrependPath)) + Options.LinkOpts.DSYMSearchPaths, + Options.LinkOpts.PrependPath, + Options.LinkOpts.BuildVariantSuffix)) return EXIT_FAILURE; continue; } - auto DebugMapPtrsOrErr = - parseDebugMap(Options.LinkOpts.VFS, InputFile, Options.Archs, - Options.LinkOpts.PrependPath, Options.LinkOpts.Verbose, - Options.InputIsYAMLDebugMap); + auto DebugMapPtrsOrErr = parseDebugMap( + Options.LinkOpts.VFS, InputFile, Options.Archs, + Options.LinkOpts.DSYMSearchPaths, Options.LinkOpts.PrependPath, + Options.LinkOpts.BuildVariantSuffix, Options.LinkOpts.Verbose, + Options.InputIsYAMLDebugMap); if (auto EC = DebugMapPtrsOrErr.getError()) { WithColor::error() << "cannot parse the debug map for '" << InputFile diff --git a/llvm/tools/dsymutil/dsymutil.h b/llvm/tools/dsymutil/dsymutil.h index ddecd8a76c7fc..5504dd57c7e55 100644 --- a/llvm/tools/dsymutil/dsymutil.h +++ b/llvm/tools/dsymutil/dsymutil.h @@ -35,12 +35,14 @@ namespace dsymutil { ErrorOr>> parseDebugMap(llvm::IntrusiveRefCntPtr VFS, StringRef InputFile, ArrayRef Archs, - StringRef PrependPath, bool Verbose, bool InputIsYAML); + ArrayRef DSYMSearchPaths, StringRef PrependPath, + StringRef VariantSuffix, bool Verbose, bool InputIsYAML); /// Dump the symbol table. 
 bool dumpStab(llvm::IntrusiveRefCntPtr<vfs::FileSystem> VFS,
               StringRef InputFile, ArrayRef<std::string> Archs,
-              StringRef PrependPath = "");
+              ArrayRef<std::string> DSYMSearchPaths, StringRef PrependPath = "",
+              StringRef VariantSuffix = "");
 
 } // end namespace dsymutil
 } // end namespace llvm
diff --git a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
index d97dd7392b0df..02a94596ec764 100644
--- a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
+++ b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp
@@ -132,11 +132,23 @@ class ObjFileAddressMap : public AddressMapBase {
     return std::nullopt;
   }
 
+  std::optional<StringRef> getLibraryInstallName() override {
+    return std::nullopt;
+  }
+
   bool applyValidRelocs(MutableArrayRef<char>, uint64_t, bool) override {
     // no need to apply relocations to the linked binary.
     return false;
   }
 
+  bool needToSaveValidRelocs() override { return false; }
+
+  void updateAndSaveValidRelocs(bool, uint64_t, int64_t, uint64_t,
+                                uint64_t) override {}
+
+  void updateRelocationsWithUnitOffset(uint64_t OriginalUnitOffset,
+                                       uint64_t OutputUnitOffset) override {}
+
   void clear() override {}
 
 protected:
diff --git a/llvm/tools/llvm-nm/llvm-nm.cpp b/llvm/tools/llvm-nm/llvm-nm.cpp
index e32aa8ab8f5bf..fede89e9c1167 100644
--- a/llvm/tools/llvm-nm/llvm-nm.cpp
+++ b/llvm/tools/llvm-nm/llvm-nm.cpp
@@ -561,37 +561,22 @@ struct DarwinStabName {
   const char *Name;
 };
 const struct DarwinStabName DarwinStabNames[] = {
-    {MachO::N_GSYM, "GSYM"},
-    {MachO::N_FNAME, "FNAME"},
-    {MachO::N_FUN, "FUN"},
-    {MachO::N_STSYM, "STSYM"},
-    {MachO::N_LCSYM, "LCSYM"},
-    {MachO::N_BNSYM, "BNSYM"},
-    {MachO::N_PC, "PC"},
-    {MachO::N_AST, "AST"},
-    {MachO::N_OPT, "OPT"},
-    {MachO::N_RSYM, "RSYM"},
-    {MachO::N_SLINE, "SLINE"},
-    {MachO::N_ENSYM, "ENSYM"},
-    {MachO::N_SSYM, "SSYM"},
-    {MachO::N_SO, "SO"},
-    {MachO::N_OSO, "OSO"},
-    {MachO::N_LSYM, "LSYM"},
-    {MachO::N_BINCL, "BINCL"},
-    {MachO::N_SOL, "SOL"},
-    {MachO::N_PARAMS, "PARAM"},
-    {MachO::N_VERSION, "VERS"},
-    {MachO::N_OLEVEL, "OLEV"},
-    {MachO::N_PSYM, "PSYM"},
-    {MachO::N_EINCL, "EINCL"},
-    {MachO::N_ENTRY, "ENTRY"},
-    {MachO::N_LBRAC, "LBRAC"},
-    {MachO::N_EXCL, "EXCL"},
-    {MachO::N_RBRAC, "RBRAC"},
-    {MachO::N_BCOMM, "BCOMM"},
-    {MachO::N_ECOMM, "ECOMM"},
-    {MachO::N_ECOML, "ECOML"},
-    {MachO::N_LENG, "LENG"},
+    {MachO::N_GSYM, "GSYM"},    {MachO::N_FNAME, "FNAME"},
+    {MachO::N_FUN, "FUN"},      {MachO::N_STSYM, "STSYM"},
+    {MachO::N_LCSYM, "LCSYM"},  {MachO::N_BNSYM, "BNSYM"},
+    {MachO::N_PC, "PC"},        {MachO::N_AST, "AST"},
+    {MachO::N_OPT, "OPT"},      {MachO::N_RSYM, "RSYM"},
+    {MachO::N_SLINE, "SLINE"},  {MachO::N_ENSYM, "ENSYM"},
+    {MachO::N_SSYM, "SSYM"},    {MachO::N_SO, "SO"},
+    {MachO::N_OSO, "OSO"},      {MachO::N_LIB, "LIB"},
+    {MachO::N_LSYM, "LSYM"},    {MachO::N_BINCL, "BINCL"},
+    {MachO::N_SOL, "SOL"},      {MachO::N_PARAMS, "PARAM"},
+    {MachO::N_VERSION, "VERS"}, {MachO::N_OLEVEL, "OLEV"},
+    {MachO::N_PSYM, "PSYM"},    {MachO::N_EINCL, "EINCL"},
+    {MachO::N_ENTRY, "ENTRY"},  {MachO::N_LBRAC, "LBRAC"},
+    {MachO::N_EXCL, "EXCL"},    {MachO::N_RBRAC, "RBRAC"},
+    {MachO::N_BCOMM, "BCOMM"},  {MachO::N_ECOMM, "ECOMM"},
+    {MachO::N_ECOML, "ECOML"},  {MachO::N_LENG, "LENG"},
 };
 
 static const char *getDarwinStabString(uint8_t NType) {

From 0cbaff815cf2083b956af037c2efbdce722ed560 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Thu, 26 Oct 2023 10:48:29 -0700
Subject: [PATCH 105/877] [mlir][sparse] cleanup conversion test (#70356)

Various TODOs had been added that effectively removed the actual tests.
This puts the CHECK tests back and removes the TODOs for which there are
no immediate plans.
---
 .../SparseTensor/convert_sparse2sparse.mlir   | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
index 658e8aa40022e..0280e27b4e312 100644
--- a/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
+++ b/mlir/test/Dialect/SparseTensor/convert_sparse2sparse.mlir
@@ -41,25 +41,24 @@ func.func @sparse_nop_convert(%arg0: tensor<64xf32, #SparseVector>) -> tensor<64
 }
 
 // CHECK-LABEL: func.func @sparse_hidden_nop_cast
-// TODO: The following convert should be a cast instead.
-// CHECK: sparse_tensor.convert
-// CHECK: return
+// CHECK-NEXT: sparse_tensor.convert
+// CHECK-NEXT: return
 func.func @sparse_hidden_nop_cast(%arg0: tensor<32xf32, #SparseVector>) -> tensor {
   %0 = sparse_tensor.convert %arg0 : tensor<32xf32, #SparseVector> to tensor
   return %0 : tensor
 }
 
 // CHECK-LABEL: func.func @sparse_convert_1d_ss(
-// TODO: libgen path need to support efficient format conversion (e.g., 32 bit pos -> 64 bit pos).
-// Maybe we should use a different operator as well to be clear.
+// CHECK-NEXT: sparse_tensor.convert
+// CHECK-NEXT: return
 func.func @sparse_convert_1d_ss(%arg0: tensor) -> tensor {
   %0 = sparse_tensor.convert %arg0 : tensor to tensor
   return %0 : tensor
 }
 
 // CHECK-LABEL: func.func @sparse_convert(
-// TODO: libgen path need to support efficient format conversion (e.g., 32 bit pos -> 64 bit pos).
-// Maybe we should use a different operator as well to be clear.
+// CHECK-NEXT: sparse_tensor.convert
+// CHECK-NEXT: return
 func.func @sparse_convert(%arg0: tensor) -> tensor {
   %0 = sparse_tensor.convert %arg0 : tensor to tensor
   return %0 : tensor
@@ -73,6 +72,7 @@ func.func @sparse_convert(%arg0: tensor) -> tensor) -> tensor {
   %0 = sparse_tensor.convert %arg0 : tensor to tensor
   return %0 : tensor
 }
@@ -83,6 +83,7 @@ func.func @sparse_convert_permuted(%arg0: tensor) -> te
 // CHECK: sparse_tensor.insert
 // CHECK: sparse_tensor.load
 // CHECK-NOT: sparse_tensor.reorder_coo
+// CHECK: return
 func.func @sparse_convert_slice(%arg0: tensor<2x13xi32, #COOSlice>) -> (tensor<2x13xi32, #SortedCOO2D>) {
   %0 = sparse_tensor.convert %arg0 : tensor<2x13xi32, #COOSlice> to tensor<2x13xi32, #SortedCOO2D>
   return %0 : tensor<2x13xi32, #SortedCOO2D>

From 02fcae844c30e086600018e20c2e512269da1126 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Thu, 26 Oct 2023 17:54:25 +0000
Subject: [PATCH 106/877] [gn build] Port 88d00a6897d7

---
 llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
index db8f5d1da3097..e962d100f217a 100644
--- a/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn
@@ -28,6 +28,7 @@ driver_executable("dsymutil") {
     "DwarfLinkerForBinary.cpp",
     "MachODebugMapParser.cpp",
     "MachOUtils.cpp",
+    "RelocationMap.cpp",
    "Reproducer.cpp",
     "SymbolMap.cpp",
     "dsymutil.cpp",

From d307dc5b512753efa4db45576fa9aeed1de97e62 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 26 Oct 2023 10:38:59 -0700
Subject: [PATCH 107/877] [RISCV][GISel] Allow G_AND/G_OR/G_XOR to have s32 types on RV64.

Even though we don't have W instructions for them, this treats them the
same as other binary operators.
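As an illustration (example IR only, not part of this patch): with these
rules a 32-bit bitwise operation on RV64 keeps its s32 type through
legalization, so input such as

    define i32 @and32(i32 %a, i32 %b) {
      %r = and i32 %a, %b
      ret i32 %r
    }

should now select to a single AND, with the upper 32 bits of the destination
register left as anyext garbage, just like G_ADD and G_SUB.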
--- .../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 7 +--- llvm/lib/Target/RISCV/RISCVGISel.td | 10 +++++ .../legalizer/rv64/legalize-abs.mir | 22 ++++------ .../legalizer/rv64/legalize-add.mir | 10 +++-- .../legalizer/rv64/legalize-addo-subo.mir | 14 +++++-- .../legalizer/rv64/legalize-and.mir | 28 +++++++++---- .../legalizer/rv64/legalize-ashr.mir | 24 +++++------ .../legalizer/rv64/legalize-div.mir | 42 +++++++++---------- .../legalizer/rv64/legalize-load.mir | 31 ++++++-------- .../legalizer/rv64/legalize-lshr.mir | 42 +++++++++---------- .../GlobalISel/legalizer/rv64/legalize-or.mir | 28 +++++++++---- .../legalizer/rv64/legalize-rem.mir | 42 +++++++++---------- .../legalizer/rv64/legalize-shl.mir | 24 +++++------ .../legalizer/rv64/legalize-store.mir | 22 ++++------ .../legalizer/rv64/legalize-sub.mir | 10 +++-- .../legalizer/rv64/legalize-xor.mir | 28 +++++++++---- 16 files changed, 212 insertions(+), 172 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 736422c4af131..1d4e279d524be 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -33,12 +33,7 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST) { using namespace TargetOpcode; - getActionDefinitionsBuilder({G_AND, G_OR, G_XOR}) - .legalFor({XLenLLT}) - .widenScalarToNextPow2(0) - .clampScalar(0, XLenLLT, XLenLLT); - - getActionDefinitionsBuilder({G_ADD, G_SUB}) + getActionDefinitionsBuilder({G_ADD, G_SUB, G_AND, G_OR, G_XOR}) .legalFor({s32, XLenLLT}) .widenScalarToNextPow2(0) .clampScalar(0, s32, XLenLLT); diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 60896106bc0b5..fcac2f3655962 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -90,12 +90,22 @@ def : Pat<(XLenVT (sub GPR:$rs1, simm12Plus1:$imm)), let Predicates = [IsRV64] in { def : Pat<(i32 (add GPR:$rs1, GPR:$rs2)), (ADDW GPR:$rs1, GPR:$rs2)>; def : Pat<(i32 (sub GPR:$rs1, GPR:$rs2)), (SUBW GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (and GPR:$rs1, GPR:$rs2)), (AND GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (or GPR:$rs1, GPR:$rs2)), (OR GPR:$rs1, GPR:$rs2)>; +def : Pat<(i32 (xor GPR:$rs1, GPR:$rs2)), (XOR GPR:$rs1, GPR:$rs2)>; def : Pat<(i32 (add GPR:$rs1, simm12i32:$imm)), (ADDIW GPR:$rs1, (i64 (as_i64imm $imm)))>; def : Pat<(i32 (sub GPR:$rs1, simm12Plus1i32:$imm)), (ADDIW GPR:$rs1, (i64 (NegImm $imm)))>; +def : Pat<(i32 (and GPR:$rs1, simm12i32:$imm)), + (ANDI GPR:$rs1, (i64 (as_i64imm $imm)))>; +def : Pat<(i32 (or GPR:$rs1, simm12i32:$imm)), + (ORI GPR:$rs1, (i64 (as_i64imm $imm)))>; +def : Pat<(i32 (xor GPR:$rs1, simm12i32:$imm)), + (XORI GPR:$rs1, (i64 (as_i64imm $imm)))>; + def : Pat<(i32 (shl GPR:$rs1, (i32 GPR:$rs2))), (SLLW GPR:$rs1, GPR:$rs2)>; def : Pat<(i32 (sra GPR:$rs1, (i32 GPR:$rs2))), (SRAW GPR:$rs1, GPR:$rs2)>; def : Pat<(i32 (srl GPR:$rs1, (i32 GPR:$rs2))), (SRLW GPR:$rs1, GPR:$rs2)>; diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-abs.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-abs.mir index ae86ede4724ab..d7b694b9dfdb8 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-abs.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-abs.mir @@ -15,11 +15,10 @@ body: | ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[C]](s32) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ASSERT_ZEXT]](s64) ; CHECK-NEXT: 
[[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC1]], [[ASHR1]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR1]](s32) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ANYEXT]], [[ANYEXT1]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR1]] ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[XOR]], [[C2]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C2]] ; CHECK-NEXT: $x10 = COPY [[AND]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 @@ -45,11 +44,10 @@ body: | ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[C]](s32) ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ASSERT_SEXT]](s64) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC1]], [[ASHR1]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR1]](s32) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ANYEXT]], [[ANYEXT1]] + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 - ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[XOR]], [[C2]](s64) + ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[ANYEXT]], [[C2]](s64) ; CHECK-NEXT: [[ASHR2:%[0-9]+]]:_(s64) = G_ASHR [[SHL1]], [[C2]](s64) ; CHECK-NEXT: $x10 = COPY [[ASHR2]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -72,11 +70,9 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31 ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[TRUNC]], [[C]](s32) ; CHECK-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[TRUNC]], [[ASHR]] - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ADD]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR]](s32) - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ANYEXT]], [[ANYEXT1]] - ; CHECK-NEXT: [[SEXT_INREG:%[0-9]+]]:_(s64) = G_SEXT_INREG [[XOR]], 32 - ; CHECK-NEXT: $x10 = COPY [[SEXT_INREG]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[ADD]], [[ASHR]] + ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[XOR]](s32) + ; CHECK-NEXT: $x10 = COPY [[SEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %1:_(s64) = COPY $x10 %2:_(s64) = G_ASSERT_SEXT %1, 32 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-add.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-add.mir index 4a406fbe76cd3..11ac3c5991376 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-add.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-add.mir @@ -195,11 +195,15 @@ body: | ; CHECK-NEXT: [[ADD2:%[0-9]+]]:_(s64) = G_ADD [[ADD1]], [[AND]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[ADD2]](s64), [[C1]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ICMP2]], [[ICMP]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ICMP1]], [[AND1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC2]], [[AND1]] ; CHECK-NEXT: [[ADD3:%[0-9]+]]:_(s64) = G_ADD %hi1, %hi2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; 
CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[OR]], [[C2]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C2]] ; CHECK-NEXT: [[ADD4:%[0-9]+]]:_(s64) = G_ADD [[ADD3]], [[AND2]] ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) ; CHECK-NEXT: $x11 = COPY [[ADD2]](s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-addo-subo.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-addo-subo.mir index 5659f89f55ede..6600f7f30d729 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-addo-subo.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-addo-subo.mir @@ -122,9 +122,12 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[ADD]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) ; CHECK-NEXT: $x10 = COPY [[ADD]](s64) - ; CHECK-NEXT: $x11 = COPY [[XOR]](s64) + ; CHECK-NEXT: $x11 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -255,9 +258,12 @@ body: | ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s64) = G_ICMP intpred(slt), [[SUB]](s64), [[COPY]] ; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s64) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]] - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[ICMP1]], [[ICMP]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) ; CHECK-NEXT: $x10 = COPY [[SUB]](s64) - ; CHECK-NEXT: $x11 = COPY [[XOR]](s64) + ; CHECK-NEXT: $x11 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10, implicit $x11 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-and.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-and.mir index f7802eb3b6b2f..89541575cf1c8 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-and.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-and.mir @@ -8,8 +8,11 @@ body: | ; CHECK-LABEL: name: and_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[AND]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -28,8 +31,11 @@ body: | ; CHECK-LABEL: name: and_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[AND]](s64) 
+ ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -48,8 +54,11 @@ body: | ; CHECK-LABEL: name: and_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[AND]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -68,8 +77,11 @@ body: | ; CHECK-LABEL: name: and_i32 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[AND]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[AND]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ashr.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ashr.mir index 02c6b630acbda..a422c42e77684 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ashr.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-ashr.mir @@ -8,14 +8,14 @@ body: | ; CHECK-LABEL: name: ashr_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 24 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C1]](s32) ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]](s32) - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[TRUNC]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR1]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -36,14 +36,14 @@ body: | ; CHECK-LABEL: name: ashr_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-NEXT: 
[[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 17 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C1]](s32) ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]](s32) - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[TRUNC]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR1]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -64,14 +64,14 @@ body: | ; CHECK-LABEL: name: ashr_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[C1]](s32) ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s32) = G_ASHR [[SHL]], [[C1]](s32) - ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[TRUNC]](s32) + ; CHECK-NEXT: [[ASHR1:%[0-9]+]]:_(s32) = G_ASHR [[ASHR]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[ASHR1]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-div.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-div.mir index 5909a062844a6..d3895cb54092e 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-div.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-div.mir @@ -332,13 +332,13 @@ body: | ; CHECK-M-LABEL: name: udiv_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 @@ -373,13 +373,13 @@ body: | ; CHECK-M-LABEL: name: udiv_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; 
CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 @@ -414,13 +414,13 @@ body: | ; CHECK-M-LABEL: name: udiv_i16 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UDIV:%[0-9]+]]:_(s32) = G_UDIV [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UDIV]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-load.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-load.mir index 43cb3445ac4cc..895e10a8b5fae 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-load.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-load.mir @@ -220,10 +220,9 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ZEXTLOAD]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ANYEXT]], [[ANYEXT1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s16) = G_LOAD %0(p0) :: (load (s16), align 1) @@ -259,9 +258,7 @@ body: | ; CHECK-NEXT: [[ZEXTLOAD1:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD]](p0) :: (load (s8) from unknown-address + 1) ; CHECK-NEXT: 
[[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[ZEXTLOAD1]], [[C1]](s32) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ZEXTLOAD]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ANYEXT]], [[ANYEXT1]] + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C2]](s64) ; CHECK-NEXT: [[ZEXTLOAD2:%[0-9]+]]:_(s32) = G_ZEXTLOAD [[PTR_ADD1]](p0) :: (load (s8) from unknown-address + 2) @@ -270,15 +267,12 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD2]](p0) :: (load (s8) from unknown-address + 3) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 ; CHECK-NEXT: [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C4]](s32) - ; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL1]](s32) - ; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[ZEXTLOAD2]](s32) - ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s64) = G_OR [[ANYEXT2]], [[ANYEXT3]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[OR1]](s64) + ; CHECK-NEXT: [[OR1:%[0-9]+]]:_(s32) = G_OR [[SHL1]], [[ZEXTLOAD2]] ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 - ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[TRUNC]], [[C5]](s32) - ; CHECK-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL2]](s32) - ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s64) = G_OR [[ANYEXT4]], [[OR]] - ; CHECK-NEXT: $x10 = COPY [[OR2]](s64) + ; CHECK-NEXT: [[SHL2:%[0-9]+]]:_(s32) = G_SHL [[OR1]], [[C5]](s32) + ; CHECK-NEXT: [[OR2:%[0-9]+]]:_(s32) = G_OR [[SHL2]], [[OR]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR2]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 1) @@ -314,10 +308,9 @@ body: | ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p0) :: (load (s16) from unknown-address + 2) ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[LOAD]], [[C1]](s32) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) - ; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[ZEXTLOAD]](s32) - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ANYEXT]], [[ANYEXT1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[SHL]], [[ZEXTLOAD]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(p0) = COPY $x10 %1:_(s32) = G_LOAD %0(p0) :: (load (s32), align 2) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-lshr.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-lshr.mir index 78cd648706de9..aa34e7d2cdcae 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-lshr.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-lshr.mir @@ -8,13 +8,13 @@ body: | ; CHECK-LABEL: name: lshr_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]] - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; 
CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -35,13 +35,13 @@ body: | ; CHECK-LABEL: name: lshr_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]] - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -62,13 +62,13 @@ body: | ; CHECK-LABEL: name: lshr_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]] - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-or.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-or.mir index d5b26a42b203e..3c56929ef67bd 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-or.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-or.mir @@ -8,8 +8,11 @@ body: | ; CHECK-LABEL: name: or_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: 
[[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -28,8 +31,11 @@ body: | ; CHECK-LABEL: name: or_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -48,8 +54,11 @@ body: | ; CHECK-LABEL: name: or_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -68,8 +77,11 @@ body: | ; CHECK-LABEL: name: or_i32 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[OR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-rem.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-rem.mir index ec165ce551330..e43516ee4acbb 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-rem.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-rem.mir @@ -332,13 +332,13 @@ body: | ; CHECK-M-LABEL: name: urem_i8 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC 
[[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 @@ -373,13 +373,13 @@ body: | ; CHECK-M-LABEL: name: urem_i15 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 @@ -414,13 +414,13 @@ body: | ; CHECK-M-LABEL: name: urem_i16 ; CHECK-M: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-M-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C]] - ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C1]] - ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[TRUNC]], [[TRUNC1]] + ; CHECK-M-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-M-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-M-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] + ; CHECK-M-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-M-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-M-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC1]], [[C1]] + ; CHECK-M-NEXT: [[UREM:%[0-9]+]]:_(s32) = G_UREM [[AND]], [[AND1]] ; CHECK-M-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[UREM]](s32) ; CHECK-M-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-M-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-shl.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-shl.mir index 3d81a140e3be2..0f9587b4c8b8d 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-shl.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-shl.mir @@ -8,11 +8,11 @@ body: | ; CHECK-LABEL: name: shl_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 255 - ; CHECK-NEXT: 
[[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 255 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -33,11 +33,11 @@ body: | ; CHECK-LABEL: name: shl_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 32767 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32767 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 @@ -58,11 +58,11 @@ body: | ; CHECK-LABEL: name: shl_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY1]], [[C]] - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C]] ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[TRUNC]](s32) + ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[TRUNC1]], [[AND]](s32) ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[SHL]](s32) ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-store.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-store.mir index 0cba84fad1e2a..fb4ead8f8d711 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-store.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-store.mir @@ -209,10 +209,9 @@ body: | ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(p0) = COPY $x11 ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C1]] - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[C]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[C1]] + ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C]](s32) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C2]](s64) ; 
CHECK-NEXT: G_STORE [[TRUNC]](s32), [[COPY1]](p0) :: (store (s8)) @@ -254,20 +253,17 @@ body: | ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 ; CHECK-NEXT: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C1]](s64) ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[COPY]], [[C3]] - ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[AND]](s64) - ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC1]], [[C2]](s32) + ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C3]] + ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[AND]], [[C2]](s32) ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY1]], [[C4]](s64) ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY1]](p0) :: (store (s8)) ; CHECK-NEXT: G_STORE [[LSHR1]](s32), [[PTR_ADD1]](p0) :: (store (s8) into unknown-address + 1) ; CHECK-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 8 - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[LSHR]](s32) - ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 65535 - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C6]] - ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[AND1]](s64) - ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[TRUNC2]], [[C5]](s32) + ; CHECK-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535 + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[LSHR]], [[C6]] + ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[AND1]], [[C5]](s32) ; CHECK-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 ; CHECK-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C7]](s64) ; CHECK-NEXT: G_STORE [[LSHR]](s32), [[PTR_ADD]](p0) :: (store (s8) into unknown-address + 2) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-sub.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-sub.mir index 81c4edf2add7c..20bf8929c5552 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-sub.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-sub.mir @@ -195,11 +195,15 @@ body: | ; CHECK-NEXT: [[SUB2:%[0-9]+]]:_(s64) = G_SUB [[SUB1]], [[AND]] ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 ; CHECK-NEXT: [[ICMP2:%[0-9]+]]:_(s64) = G_ICMP intpred(eq), [[SUB1]](s64), [[C1]] - ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s64) = G_AND [[ICMP2]], [[ICMP]] - ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s64) = G_OR [[ICMP1]], [[AND1]] + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP2]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP]](s64) + ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s32) = G_TRUNC [[ICMP1]](s64) + ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[TRUNC2]], [[AND1]] ; CHECK-NEXT: [[SUB3:%[0-9]+]]:_(s64) = G_SUB %hi1, %hi2 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[OR]], [[C2]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[OR]](s32) + ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C2]] ; CHECK-NEXT: [[SUB4:%[0-9]+]]:_(s64) = G_SUB [[SUB3]], [[AND2]] ; CHECK-NEXT: $x10 = COPY [[SUB]](s64) ; CHECK-NEXT: $x11 = COPY [[SUB2]](s64) diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-xor.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-xor.mir index 80081bd2498a1..469f8b25f7ec1 100644 --- 
a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-xor.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rv64/legalize-xor.mir @@ -8,8 +8,11 @@ body: | ; CHECK-LABEL: name: xor_i8 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[XOR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -28,8 +31,11 @@ body: | ; CHECK-LABEL: name: xor_i15 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[XOR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -48,8 +54,11 @@ body: | ; CHECK-LABEL: name: xor_i16 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[XOR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 @@ -68,8 +77,11 @@ body: | ; CHECK-LABEL: name: xor_i32 ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x10 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x11 - ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s64) = G_XOR [[COPY]], [[COPY1]] - ; CHECK-NEXT: $x10 = COPY [[XOR]](s64) + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC [[COPY]](s64) + ; CHECK-NEXT: [[TRUNC1:%[0-9]+]]:_(s32) = G_TRUNC [[COPY1]](s64) + ; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s32) = G_XOR [[TRUNC]], [[TRUNC1]] + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[XOR]](s32) + ; CHECK-NEXT: $x10 = COPY [[ANYEXT]](s64) ; CHECK-NEXT: PseudoRET implicit $x10 %0:_(s64) = COPY $x10 %1:_(s64) = COPY $x11 From 645b7795d49774ea055fc4e803bf99f742d2739a Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Thu, 26 Oct 2023 12:55:26 -0500 Subject: [PATCH 108/877] [mlir] Remove no-op ptr-to-ptr bitcasts (NFC) Opaque pointer cleanup effort. NFC. 
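For context, a minimal sketch of why these casts were dead (illustrative IR,
not taken from this patch): with opaque pointers the source and destination
of such a cast are both simply ptr, e.g.

    define void @example(ptr %g, i64 %size) {
      ; the removed code effectively emitted
      ;   %data = bitcast ptr %g to ptr   ; an identity, folded away
      call void @use(ptr %g, i64 %size)
      ret void
    }
    declare void @use(ptr, i64)

IRBuilder::CreateBitCast already returns its input unchanged when the source
and destination types are identical, so removing these calls does not change
the emitted IR.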
--- mlir/lib/ExecutionEngine/ExecutionEngine.cpp | 1 - .../LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp | 8 ++------ .../LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp | 8 ++------ 3 files changed, 4 insertions(+), 13 deletions(-) diff --git a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp index 9acbc9adf8140..dbcc0ba6fc99c 100644 --- a/mlir/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/mlir/lib/ExecutionEngine/ExecutionEngine.cpp @@ -194,7 +194,6 @@ static void packFunctionArguments(Module *module) { builder.CreateGEP(builder.getInt8PtrTy(), argList, retIndex); llvm::Value *retPtr = builder.CreateLoad(builder.getInt8PtrTy(), retPtrPtr); - retPtr = builder.CreateBitCast(retPtr, result->getType()->getPointerTo()); builder.CreateStore(result, retPtr); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp index 37fec190d6f40..9ce20d798ab6b 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenACC/OpenACCToLLVMIRTranslation.cpp @@ -115,17 +115,13 @@ processOperands(llvm::IRBuilderBase &builder, llvm::Value *ptrBaseGEP = builder.CreateInBoundsGEP( arrI8PtrTy, mapperAllocas.ArgsBase, {builder.getInt32(0), builder.getInt32(index)}); - llvm::Value *ptrBaseCast = builder.CreateBitCast( - ptrBaseGEP, dataPtrBase->getType()->getPointerTo()); - builder.CreateStore(dataPtrBase, ptrBaseCast); + builder.CreateStore(dataPtrBase, ptrBaseGEP); // Store pointer extracted from operand into the i-th position of args. llvm::Value *ptrGEP = builder.CreateInBoundsGEP( arrI8PtrTy, mapperAllocas.Args, {builder.getInt32(0), builder.getInt32(index)}); - llvm::Value *ptrCast = - builder.CreateBitCast(ptrGEP, dataPtr->getType()->getPointerTo()); - builder.CreateStore(dataPtr, ptrCast); + builder.CreateStore(dataPtr, ptrGEP); // Store size extracted from operand into the i-th position of argSizes. llvm::Value *sizeGEP = builder.CreateInBoundsGEP( diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index eb8f6cf277b11..875ce11391587 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1523,8 +1523,6 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, LLVM::GlobalOp global = addressOfOp.getGlobal(moduleTranslation.symbolTable()); llvm::GlobalValue *globalValue = moduleTranslation.lookupGlobal(global); - llvm::Value *data = - builder.CreateBitCast(globalValue, builder.getInt8PtrTy()); llvm::Type *type = globalValue->getValueType(); llvm::TypeSize typeSize = builder.GetInsertBlock()->getModule()->getDataLayout().getTypeStoreSize( @@ -1532,12 +1530,10 @@ convertOmpThreadprivate(Operation &opInst, llvm::IRBuilderBase &builder, llvm::ConstantInt *size = builder.getInt64(typeSize.getFixedValue()); llvm::StringRef suffix = llvm::StringRef(".cache", 6); std::string cacheName = (Twine(global.getSymName()).concat(suffix)).str(); - // Emit runtime function and bitcast its type (i8*) to real data type. 
   llvm::Value *callInst =
       moduleTranslation.getOpenMPBuilder()->createCachedThreadPrivate(
-          ompLoc, data, size, cacheName);
-  llvm::Value *result = builder.CreateBitCast(callInst, globalValue->getType());
-  moduleTranslation.mapValue(opInst.getResult(0), result);
+          ompLoc, globalValue, size, cacheName);
+  moduleTranslation.mapValue(opInst.getResult(0), callInst);
   return success();
 }
 

From fceb7193ea538827ce7cfdf43bb972d88df53878 Mon Sep 17 00:00:00 2001
From: Ralf Jung
Date: Thu, 26 Oct 2023 20:09:38 +0200
Subject: [PATCH 109/877] clarify NaN propagation in fptrunc (#68554)

Follow-up to #66579: while implementing those semantics in Miri I realized
there's a special case to be considered in truncating float casts.
---
 llvm/docs/LangRef.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index cabb5cd1bed62..c97a7ae372bc6 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -11332,7 +11332,10 @@ environment <floatenv>`.
 NaN values follow the usual :ref:`NaN behaviors <floatnan>`, except that _if_ a
 NaN payload is propagated from the input ("Quieting NaN propagation" or
 "Unchanged NaN propagation" cases), then the low order bits of the NaN payload
-which cannot fit in the resulting type are discarded.
+which cannot fit in the resulting type are discarded. Note that if discarding
+the low order bits leads to an all-0 payload, this cannot be represented as a
+signaling NaN (it would represent an infinity instead), so in that case
+"Unchanged NaN propagation" is not possible.
 
 Example:
 """"""""

From ff94061a9f15c121fa1068caed680c6916a215e0 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Thu, 26 Oct 2023 11:14:44 -0700
Subject: [PATCH 110/877] [mlir][sparse] remove reshape dot test (#70359)

This no longer tests a required feature.
---
 .../SparseTensor/sparse_reshape_dot.mlir      | 89 ------------------
 1 file changed, 89 deletions(-)
 delete mode 100644 mlir/test/Dialect/SparseTensor/sparse_reshape_dot.mlir

diff --git a/mlir/test/Dialect/SparseTensor/sparse_reshape_dot.mlir b/mlir/test/Dialect/SparseTensor/sparse_reshape_dot.mlir
deleted file mode 100644
index c562d6845e84f..0000000000000
--- a/mlir/test/Dialect/SparseTensor/sparse_reshape_dot.mlir
+++ /dev/null
@@ -1,89 +0,0 @@
-//
-// TODO: this test case is temporarily disabled as we are improving zero-cost sparse tensor reshaping.
-// XFAIL: * -// -// RUN: mlir-opt %s --linalg-generalize-named-ops --sparsification --cse --canonicalize | FileCheck %s - -#COO_2D = #sparse_tensor.encoding<{ map = (d0, d1) -> (d0 : compressed(nonunique), d1 : singleton), posWidth = 32, crdWidth = 32 }> -#COO_3D = #sparse_tensor.encoding<{ map = (d0, d1, d2) -> (d0 : compressed(nonunique), d1 : singleton(nonunique), d2 : singleton), posWidth = 32, crdWidth = 32 }> - - -// CHECK-LABEL: func.func @sparse_reshape_fused( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<5x6xf32>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<6x2x3xf32, #sparse_tensor.encoding<{{{.*}}}>>) -> tensor { -// CHECK-DAG: %[[VAL_2:.*]] = arith.constant false -// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 5 : index -// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 3 : index -// CHECK-DAG: %[[VAL_5:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[VAL_7:.*]] = tensor.empty() : tensor<5x6xf32> -// CHECK-DAG: %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} -// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} -// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} -// CHECK-DAG: %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 2 : index} -// CHECK-DAG: %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_1]] -// CHECK-DAG: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_7]] : memref<5x6xf32> -// CHECK: scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] { -// CHECK: %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref -// CHECK: %[[VAL_16:.*]] = arith.extui %[[VAL_15]] : i32 to i64 -// CHECK: %[[VAL_17:.*]] = arith.index_cast %[[VAL_16]] : i64 to index -// CHECK: %[[VAL_18:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref -// CHECK: %[[VAL_19:.*]] = arith.extui %[[VAL_18]] : i32 to i64 -// CHECK: %[[VAL_20:.*]] = arith.index_cast %[[VAL_19]] : i64 to index -// CHECK: %[[VAL_21:.*]] = scf.while (%[[VAL_22:.*]] = %[[VAL_17]]) : (index) -> index { -// CHECK: %[[VAL_23:.*]] = arith.cmpi ult, %[[VAL_22]], %[[VAL_20]] : index -// CHECK: scf.condition(%[[VAL_23]]) %[[VAL_22]] : index -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_24:.*]]: index): -// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_24]]] : memref> -// CHECK: %[[VAL_26:.*]] = arith.extui %[[VAL_25]] : i32 to i64 -// CHECK: %[[VAL_27:.*]] = arith.index_cast %[[VAL_26]] : i64 to index -// CHECK: %[[VAL_28:.*]] = scf.while (%[[VAL_29:.*]] = %[[VAL_24]]) : (index) -> index { -// CHECK: %[[VAL_30:.*]] = arith.cmpi ult, %[[VAL_29]], %[[VAL_20]] : index -// CHECK: %[[VAL_31:.*]] = scf.if %[[VAL_30]] -> (i1) { -// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_29]]] : memref> -// CHECK: %[[VAL_33:.*]] = arith.extui %[[VAL_32]] : i32 to i64 -// CHECK: %[[VAL_34:.*]] = arith.index_cast %[[VAL_33]] : i64 to index -// CHECK: %[[VAL_35:.*]] = arith.cmpi eq, %[[VAL_34]], %[[VAL_27]] : index -// CHECK: scf.yield %[[VAL_35]] : i1 -// CHECK: } else { -// CHECK: scf.yield %[[VAL_2]] : i1 -// CHECK: } -// CHECK: scf.condition(%[[VAL_36:.*]]) %[[VAL_29]] : index -// CHECK: } do { -// CHECK: ^bb0(%[[VAL_37:.*]]: index): -// CHECK: %[[VAL_38:.*]] = arith.addi %[[VAL_37]], %[[VAL_6]] : index -// CHECK: scf.yield %[[VAL_38]] : index -// CHECK: } -// CHECK: %[[VAL_39:.*]] = tensor.extract %[[VAL_0]]{{\[}}%[[VAL_14]], %[[VAL_27]]] : tensor<5x6xf32> -// CHECK: scf.for %[[VAL_40:.*]] = %[[VAL_24]] to %[[VAL_41:.*]] step %[[VAL_6]] { -// CHECK: %[[VAL_42:.*]] = 
memref.load %[[VAL_10]]{{\[}}%[[VAL_40]]] : memref>
-// CHECK: %[[VAL_43:.*]] = arith.extui %[[VAL_42]] : i32 to i64
-// CHECK: %[[VAL_44:.*]] = arith.index_cast %[[VAL_43]] : i64 to index
-// CHECK: %[[VAL_45:.*]] = arith.muli %[[VAL_44]], %[[VAL_4]] : index
-// CHECK: %[[VAL_46:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_40]]] : memref>
-// CHECK: %[[VAL_47:.*]] = arith.extui %[[VAL_46]] : i32 to i64
-// CHECK: %[[VAL_48:.*]] = arith.index_cast %[[VAL_47]] : i64 to index
-// CHECK: %[[VAL_49:.*]] = arith.addi %[[VAL_45]], %[[VAL_48]] : index
-// CHECK: %[[VAL_50:.*]] = tensor.extract %[[VAL_7]]{{\[}}%[[VAL_14]], %[[VAL_49]]] : tensor<5x6xf32>
-// CHECK: %[[VAL_51:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_40]]] : memref
-// CHECK: %[[VAL_52:.*]] = arith.mulf %[[VAL_39]], %[[VAL_51]] : f32
-// CHECK: %[[VAL_53:.*]] = arith.addf %[[VAL_50]], %[[VAL_52]] : f32
-// CHECK: memref.store %[[VAL_53]], %[[VAL_13]]{{\[}}%[[VAL_14]], %[[VAL_49]]] : memref<5x6xf32>
-// CHECK: } {"Emitted from" = "linalg.generic"}
-// CHECK: scf.yield %[[VAL_54:.*]] : index
-// CHECK: } attributes {"Emitted from" = "linalg.generic"}
-// CHECK: } {"Emitted from" = "linalg.generic"}
-// CHECK: %[[VAL_55:.*]] = bufferization.to_tensor %[[VAL_13]] : memref<5x6xf32>
-// CHECK: %[[VAL_56:.*]] = tensor.expand_shape %[[VAL_55]] {{\[\[}}0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
-// CHECK: %[[VAL_57:.*]] = tensor.cast %[[VAL_56]] : tensor<5x2x3xf32> to tensor
-// CHECK: return %[[VAL_57]] : tensor
-// CHECK: }
-func.func @sparse_reshape_fused(%arg0: tensor<5x6xf32>, %arg1: tensor<6x2x3xf32, #COO_3D>) -> tensor {
-  %collapsed = tensor.collapse_shape %arg1 [[0], [1, 2]] : tensor<6x2x3xf32, #COO_3D> into tensor<6x6xf32, #COO_2D>
-  %0 = tensor.empty() : tensor<5x6xf32>
-  %2 = linalg.matmul ins(%arg0, %collapsed : tensor<5x6xf32>, tensor<6x6xf32, #COO_2D>) outs(%0 : tensor<5x6xf32>) -> tensor<5x6xf32>
-  %expanded = tensor.expand_shape %2 [[0], [1, 2]] : tensor<5x6xf32> into tensor<5x2x3xf32>
-  %ret1 = tensor.cast %expanded : tensor<5x2x3xf32> to tensor
-  return %ret1 : tensor
-}

From cf07904ee4ef5445342404f28193cb8583c0a61c Mon Sep 17 00:00:00 2001
From: Jinsong Ji
Date: Thu, 26 Oct 2023 14:26:47 -0400
Subject: [PATCH 111/877] [Github] Fix libc docs build (#70363)

https://github.com/llvm/llvm-project/pull/69824 added the libc docs
build, but omitted the build directory from the ninja command, which
caused failures:

ninja: fatal: chdir to 'docs-libc-html' - No such file or directory
ninja: Entering directory `docs-libc-html'

---
 .github/workflows/docs.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index cbb3706cc1bcf..e6af2f41167e0 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -119,7 +119,7 @@ jobs:
     if: steps.docs-changed-subprojects.outputs.libc_any_changed == 'true'
     run: |
       cmake -B libc-build -GNinja -DCMAKE_BUILD_TYPE=Release -DLLVM_ENABLE_RUNTIMES="libc" -DLLVM_ENABLE_SPHINX=ON ./runtimes
-      TZ=UTC ninja -C docs-libc-html
+      TZ=UTC ninja -C libc-build docs-libc-html
     - name: Build LLD docs
       if: steps.docs-changed-subprojects.outputs.lld_any_changed == 'true'
       run: |

From f9ead46931aef2978ddf350ba6523638175d7861 Mon Sep 17 00:00:00 2001
From: Jessica Clarke
Date: Thu, 26 Oct 2023 19:28:28 +0100
Subject: [PATCH 112/877] [AST] Only dump desugared type when visibly
 different (#65214)

These are an artifact of how types are structured but serve little
purpose, merely showing that the type is sugared in some way.
For example, ElaboratedType's existence means struct S gets printed as 'struct S':'struct S' in the AST, which is unnecessary visual clutter. Note that skipping the second print when the types have the same string matches what we do for diagnostics, where the aka will be skipped. --- clang/docs/HowToSetupToolingForLLVM.rst | 2 +- clang/docs/ReleaseNotes.rst | 22 +++++++ clang/lib/AST/JSONNodeDumper.cpp | 10 +++- clang/lib/AST/TextNodeDumper.cpp | 13 ++-- .../AST/HLSL/this-reference-template.hlsl | 8 +-- .../test/AST/ast-dump-APValue-anon-union.cpp | 10 ++-- clang/test/AST/ast-dump-APValue-struct.cpp | 12 ++-- clang/test/AST/ast-dump-APValue-union.cpp | 10 ++-- clang/test/AST/ast-dump-attr.cpp | 12 ++-- clang/test/AST/ast-dump-decl-json.c | 4 -- clang/test/AST/ast-dump-decl-json.m | 1 - clang/test/AST/ast-dump-decl.cpp | 26 ++++---- clang/test/AST/ast-dump-decl.m | 2 +- clang/test/AST/ast-dump-expr-json.c | 6 -- clang/test/AST/ast-dump-expr-json.cpp | 40 ------------- clang/test/AST/ast-dump-expr-json.m | 24 -------- clang/test/AST/ast-dump-expr.c | 6 +- clang/test/AST/ast-dump-expr.cpp | 22 +++---- clang/test/AST/ast-dump-fpfeatures.cpp | 2 +- clang/test/AST/ast-dump-funcs.cpp | 2 +- clang/test/AST/ast-dump-functionprototype.cpp | 6 +- clang/test/AST/ast-dump-lambda.cpp | 2 +- clang/test/AST/ast-dump-objc-arc-json.m | 1 - ...openmp-begin-declare-variant_reference.cpp | 4 +- ...penmp-begin-declare-variant_template_1.cpp | 8 +-- ...penmp-begin-declare-variant_template_2.cpp | 14 ++--- ...penmp-begin-declare-variant_template_3.cpp | 44 +++++++------- .../AST/ast-dump-overloaded-operators.cpp | 16 ++--- clang/test/AST/ast-dump-records-json.cpp | 7 --- clang/test/AST/ast-dump-records.cpp | 8 +-- clang/test/AST/ast-dump-recovery.cpp | 12 ++-- clang/test/AST/ast-dump-stmt-json.cpp | 60 ------------------- clang/test/AST/ast-dump-stmt.cpp | 52 ++++++++-------- clang/test/AST/ast-dump-stmt.m | 2 +- .../test/AST/ast-dump-template-decls-json.cpp | 1 - clang/test/AST/ast-dump-template-decls.cpp | 2 +- ...dump-template-json-win32-mangler-crash.cpp | 2 - clang/test/AST/ast-dump-temporaries-json.cpp | 5 -- clang/test/AST/ast-dump-types-json.cpp | 1 - clang/test/AST/coroutine-locals-cleanup.cpp | 4 +- clang/test/AST/float16.cpp | 32 +++++----- clang/test/AST/nrvo.c | 12 ++-- clang/test/AST/sourceranges.cpp | 4 +- clang/test/C/drs/dr253.c | 2 +- .../dcl.decl/dcl.init/dcl.init.ref/p4-ast.cpp | 6 +- clang/test/Import/objc-param-decl/test.m | 2 +- clang/test/OpenMP/align_clause_ast_print.cpp | 2 +- clang/test/OpenMP/generic_loop_ast_print.cpp | 6 +- clang/test/OpenMP/scope_ast_print.cpp | 4 +- clang/test/SemaCXX/co_await-ast.cpp | 14 ++--- clang/test/SemaCXX/consteval-cleanup.cpp | 14 ++--- .../address-space-deduction.clcpp | 2 +- clang/test/SemaOpenCLCXX/addrspace-auto.clcpp | 8 +-- .../aggregate-deduction-candidate.cpp | 34 +++++------ clang/test/SemaTemplate/deduction-guide.cpp | 12 ++-- .../SemaTemplate/default-expr-arguments-3.cpp | 6 +- clang/test/SemaTemplate/make_integer_seq.cpp | 8 +-- clang/test/SemaTemplate/pr47676.cpp | 2 +- clang/test/SemaTemplate/type_pack_element.cpp | 2 +- clang/unittests/AST/ASTImporterTest.cpp | 2 +- 60 files changed, 269 insertions(+), 390 deletions(-) diff --git a/clang/docs/HowToSetupToolingForLLVM.rst b/clang/docs/HowToSetupToolingForLLVM.rst index 62189511aeb2a..dc1c17b0ae68d 100644 --- a/clang/docs/HowToSetupToolingForLLVM.rst +++ b/clang/docs/HowToSetupToolingForLLVM.rst @@ -172,7 +172,7 @@ Examples: clang::ASTConsumer *newASTConsumer() (CompoundStmt 0x44da290 (IfStmt 
0x44d97c8 <<>> - (ImplicitCastExpr 0x44d96d0 '_Bool':'_Bool' + (ImplicitCastExpr 0x44d96d0 '_Bool' ... $ clang-check tools/clang/tools/clang-check/ClangCheck.cpp -ast-print -ast-dump-filter ActionFactory::newASTConsumer Processing: tools/clang/tools/clang-check/ClangCheck.cpp. diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7238386231e1a..82550232947f7 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -113,6 +113,28 @@ ABI Changes in This Version - Following the SystemV ABI for x86-64, ``__int128`` arguments will no longer be split between a register and a stack slot. +AST Dumping Potentially Breaking Changes +---------------------------------------- +- When dumping a sugared type, Clang will no longer print the desugared type if + its textual representation is the same as the sugared one. This applies to + both text dumps of the form ``'foo':'foo'`` which will now be dumped as just + ``'foo'``, and JSON dumps of the form: + + .. code-block:: json + + "type": { + "qualType": "foo", + "desugaredQualType": "foo" + } + + which will now be dumped as just: + + .. code-block:: json + + "type": { + "qualType": "foo" + } + What's New in Clang |release|? ============================== Some of the major new features and improvements to Clang are listed diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index beb07015f0bcb..25b94ec5616b1 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -315,12 +315,16 @@ std::string JSONNodeDumper::createPointerRepresentation(const void *Ptr) { llvm::json::Object JSONNodeDumper::createQualType(QualType QT, bool Desugar) { SplitQualType SQT = QT.split(); - llvm::json::Object Ret{{"qualType", QualType::getAsString(SQT, PrintPolicy)}}; + std::string SQTS = QualType::getAsString(SQT, PrintPolicy); + llvm::json::Object Ret{{"qualType", SQTS}}; if (Desugar && !QT.isNull()) { SplitQualType DSQT = QT.getSplitDesugaredType(); - if (DSQT != SQT) - Ret["desugaredQualType"] = QualType::getAsString(DSQT, PrintPolicy); + if (DSQT != SQT) { + std::string DSQTS = QualType::getAsString(DSQT, PrintPolicy); + if (DSQTS != SQTS) + Ret["desugaredQualType"] = DSQTS; + } if (const auto *TT = QT->getAs()) Ret["typeAliasDeclId"] = createPointerRepresentation(TT->getDecl()); } diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 15eb6c6191edf..ca609cf2d2060 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -692,13 +692,18 @@ void TextNodeDumper::dumpBareType(QualType T, bool Desugar) { ColorScope Color(OS, ShowColors, TypeColor); SplitQualType T_split = T.split(); - OS << "'" << QualType::getAsString(T_split, PrintPolicy) << "'"; + std::string T_str = QualType::getAsString(T_split, PrintPolicy); + OS << "'" << T_str << "'"; if (Desugar && !T.isNull()) { - // If the type is sugared, also dump a (shallow) desugared type. + // If the type is sugared, also dump a (shallow) desugared type when + // it is visibly different. 
SplitQualType D_split = T.getSplitDesugaredType(); - if (T_split != D_split) - OS << ":'" << QualType::getAsString(D_split, PrintPolicy) << "'"; + if (T_split != D_split) { + std::string D_str = QualType::getAsString(D_split, PrintPolicy); + if (T_str != D_str) + OS << ":'" << QualType::getAsString(D_split, PrintPolicy) << "'"; + } } } diff --git a/clang/test/AST/HLSL/this-reference-template.hlsl b/clang/test/AST/HLSL/this-reference-template.hlsl index 3b7fba3efdc74..60e057986ebf8 100644 --- a/clang/test/AST/HLSL/this-reference-template.hlsl +++ b/clang/test/AST/HLSL/this-reference-template.hlsl @@ -35,12 +35,12 @@ void main() { // CHECK: -CXXMethodDecl 0x{{[0-9A-Fa-f]+}} line:8:5 used getFirst 'int ()' implicit_instantiation implicit-inline // CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'int':'int' -// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} 'int':'int' lvalue .First 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'int' +// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} 'int' lvalue .First 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} 'Pair' lvalue this // CHECK-NEXT:-CXXMethodDecl 0x{{[0-9A-Fa-f]+}} line:12:5 used getSecond 'float ()' implicit_instantiation implicit-inline // CHECK-NEXT:-CompoundStmt 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT:-ReturnStmt 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'float':'float' -// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} 'float':'float' lvalue .Second 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT:-ImplicitCastExpr 0x{{[0-9A-Fa-f]+}} 'float' +// CHECK-NEXT:-MemberExpr 0x{{[0-9A-Fa-f]+}} 'float' lvalue .Second 0x{{[0-9A-Fa-f]+}} // CHECK-NEXT:-CXXThisExpr 0x{{[0-9A-Fa-f]+}} 'Pair' lvalue implicit this diff --git a/clang/test/AST/ast-dump-APValue-anon-union.cpp b/clang/test/AST/ast-dump-APValue-anon-union.cpp index 906bfe4857ed0..0e6466ee1fd73 100644 --- a/clang/test/AST/ast-dump-APValue-anon-union.cpp +++ b/clang/test/AST/ast-dump-APValue-anon-union.cpp @@ -30,23 +30,23 @@ union U1 { void Test() { constexpr S0 s0{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s0 'const S0':'const S0' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s0 'const S0' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | `-field: Union .i Int 42 constexpr U0 u0a{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0a 'const U0':'const U0' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0a 'const U0' constexpr listinit // CHECK-NEXT: | |-value: Union None constexpr U0 u0b{3.1415f}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0b 'const U0':'const U0' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0b 'const U0' constexpr listinit // CHECK-NEXT: | |-value: Union .U0::(anonymous union at {{.*}}) Union .f Float 3.141500e+00 constexpr U1 u1a{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u1a 'const U1':'const U1' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u1a 'const U1' constexpr listinit // CHECK-NEXT: | |-value: Union .U1::(anonymous union at {{.*}}) Union .f Float 0.000000e+00 constexpr U1 u1b{3.1415f}; - // CHECK: `-VarDecl {{.*}} col:{{.*}} u1b 'const U1':'const U1' constexpr listinit + // CHECK: `-VarDecl {{.*}} col:{{.*}} u1b 'const U1' constexpr listinit // CHECK-NEXT: |-value: Union .U1::(anonymous union at {{.*}}) Union .f Float 3.141500e+00 } diff --git a/clang/test/AST/ast-dump-APValue-struct.cpp b/clang/test/AST/ast-dump-APValue-struct.cpp index 
04d1877c293d1..4730404abc287 100644 --- a/clang/test/AST/ast-dump-APValue-struct.cpp +++ b/clang/test/AST/ast-dump-APValue-struct.cpp @@ -60,12 +60,12 @@ struct S5 : S4 { void Test() { constexpr S0 s0{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s0 'const S0':'const S0' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s0 'const S0' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | `-fields: Int 0, Union .j Int 0 constexpr S1 s1{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s1 'const S1':'const S1' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s1 'const S1' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | |-field: Int 0 // CHECK-NEXT: | | `-field: Union .s @@ -73,12 +73,12 @@ void Test() { // CHECK-NEXT: | | `-field: Int 0 constexpr S2 s2{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s2 'const S2':'const S2' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s2 'const S2' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | `-fields: Int 0, Union .u Union .j Int 0 constexpr S3 s3{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s3 'const S3':'const S3' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s3 'const S3' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | |-field: Int 0 // CHECK-NEXT: | | `-field: Union .u @@ -87,7 +87,7 @@ void Test() { // CHECK-NEXT: | | `-field: Int 0 constexpr S4 s4{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} s4 'const S4':'const S4' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} s4 'const S4' constexpr listinit // CHECK-NEXT: | |-value: Struct // CHECK-NEXT: | | |-base: Struct // CHECK-NEXT: | | | `-fields: Int 0, Union .j Int 0 @@ -96,7 +96,7 @@ void Test() { // CHECK-NEXT: | | `-fields: Int 4, Int 5, Int 6 constexpr S5 s5{}; - // CHECK: `-VarDecl {{.*}} col:{{.*}} s5 'const S5':'const S5' constexpr listinit + // CHECK: `-VarDecl {{.*}} col:{{.*}} s5 'const S5' constexpr listinit // CHECK-NEXT: |-value: Struct // CHECK-NEXT: | |-base: Struct // CHECK-NEXT: | | |-base: Struct diff --git a/clang/test/AST/ast-dump-APValue-union.cpp b/clang/test/AST/ast-dump-APValue-union.cpp index b70b5ea484a6e..c717b6ece7382 100644 --- a/clang/test/AST/ast-dump-APValue-union.cpp +++ b/clang/test/AST/ast-dump-APValue-union.cpp @@ -39,25 +39,25 @@ union U3 { void Test() { constexpr U0 u0{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0 'const U0':'const U0' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u0 'const U0' constexpr listinit // CHECK-NEXT: | |-value: Union .i Int 42 constexpr U1 u1{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u1 'const U1':'const U1' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u1 'const U1' constexpr listinit // CHECK-NEXT: | |-value: Union .uinner Union .f Float 3.141500e+00 constexpr U2 u2{}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u2 'const U2':'const U2' constexpr listinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u2 'const U2' constexpr listinit // CHECK-NEXT: | |-value: Union .uinner // CHECK-NEXT: | | `-Union .arr // CHECK-NEXT: | | `-Array size=2 // CHECK-NEXT: | | `-elements: Int 1, Int 2 constexpr U3 u3a = {.f = 3.1415}; - // CHECK: | `-VarDecl {{.*}} col:{{.*}} u3a 'const U3':'const U3' constexpr cinit + // CHECK: | `-VarDecl {{.*}} col:{{.*}} u3a 'const U3' constexpr cinit // CHECK-NEXT: | |-value: Union .f Float 3.141500e+00 constexpr U3 u3b = {.uinner = {}}; - // CHECK: `-VarDecl {{.*}} col:{{.*}} u3b 'const U3':'const U3' constexpr cinit + // CHECK: `-VarDecl {{.*}} 
col:{{.*}} u3b 'const U3' constexpr cinit // CHECK-NEXT: |-value: Union .uinner Union .d Float 3.141500e+00 } diff --git a/clang/test/AST/ast-dump-attr.cpp b/clang/test/AST/ast-dump-attr.cpp index 8fd4a8f3a54c6..f5a7481571421 100644 --- a/clang/test/AST/ast-dump-attr.cpp +++ b/clang/test/AST/ast-dump-attr.cpp @@ -146,17 +146,17 @@ struct C { char a[16]; }; // CHECK: ClassTemplateSpecializationDecl {{.*}} struct my_union // CHECK: CXXRecordDecl {{.*}} implicit struct my_union // CHECK: FieldDecl {{.*}} buffer 'char[1024]' -// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::A':'TestAligns::A' -// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::B':'TestAligns::B' -// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::C':'TestAligns::C' +// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::A' +// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::B' +// CHECK-NEXT: AlignedAttr {{.*}} alignas 'TestAligns::C' my_union my_union_val; // CHECK: ClassTemplateSpecializationDecl {{.*}} struct my_union2 // CHECK: CXXRecordDecl {{.*}} implicit struct my_union2 // CHECK: FieldDecl {{.*}} buffer 'char[1024]' -// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::A':'TestAligns::A' -// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::B':'TestAligns::B' -// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::C':'TestAligns::C' +// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::A' +// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::B' +// CHECK-NEXT: AlignedAttr {{.*}} _Alignas 'TestAligns::C' my_union2 my_union2_val; } // namespace TestAligns diff --git a/clang/test/AST/ast-dump-decl-json.c b/clang/test/AST/ast-dump-decl-json.c index 55b918afaab5d..7f53cda5020b5 100644 --- a/clang/test/AST/ast-dump-decl-json.c +++ b/clang/test/AST/ast-dump-decl-json.c @@ -1351,7 +1351,6 @@ void testParmVarDecl(int TestParmVarDecl); // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "enum Enum", // CHECK-NEXT: "qualType": "enum Enum" // CHECK-NEXT: } // CHECK-NEXT: }, @@ -1424,7 +1423,6 @@ void testParmVarDecl(int TestParmVarDecl); // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "enum Enum", // CHECK-NEXT: "qualType": "enum Enum" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -1446,7 +1444,6 @@ void testParmVarDecl(int TestParmVarDecl); // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "enum Enum", // CHECK-NEXT: "qualType": "enum Enum" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -1455,7 +1452,6 @@ void testParmVarDecl(int TestParmVarDecl); // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "enum Enum", // CHECK-NEXT: "qualType": "enum Enum" // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/clang/test/AST/ast-dump-decl-json.m b/clang/test/AST/ast-dump-decl-json.m index 9d82c6696cb52..f7067ac0d3b77 100644 --- a/clang/test/AST/ast-dump-decl-json.m +++ b/clang/test/AST/ast-dump-decl-json.m @@ -911,7 +911,6 @@ void f(void) { // CHECK-NEXT: }, // CHECK-NEXT: "name": "T", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: } diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp index 017e640aeaea6..d74aa9045532e 100644 --- a/clang/test/AST/ast-dump-decl.cpp +++ b/clang/test/AST/ast-dump-decl.cpp 
@@ -248,19 +248,19 @@ namespace testFunctionTemplateDecl { // CHECK-NEXT: | |-TemplateArgument type 'testFunctionTemplateDecl::A' // CHECK-NEXT: | | `-RecordType 0{{.+}} 'testFunctionTemplateDecl::A' // CHECK-NEXT: | | `-CXXRecord 0x{{.+}} 'A' - // CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::A':'testFunctionTemplateDecl::A' + // CHECK-NEXT: | |-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::A' // CHECK-NEXT: | `-CompoundStmt 0x{{.+}} // CHECK-NEXT: |-Function 0x{{.+}} 'TestFunctionTemplate' 'void (B)' // CHECK-NEXT: |-FunctionDecl 0x{{.+}} col:29 TestFunctionTemplate 'void (testFunctionTemplateDecl::C)' // CHECK-NEXT: | |-TemplateArgument type 'testFunctionTemplateDecl::C' // CHECK-NEXT: | | `-RecordType 0{{.+}} 'testFunctionTemplateDecl::C' // CHECK-NEXT: | | `-CXXRecord 0x{{.+}} 'C' - // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::C':'testFunctionTemplateDecl::C' + // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::C' // CHECK-NEXT: `-FunctionDecl 0x{{.+}} col:29 TestFunctionTemplate 'void (testFunctionTemplateDecl::D)' // CHECK-NEXT: |-TemplateArgument type 'testFunctionTemplateDecl::D' // CHECK-NEXT: | `-RecordType 0{{.+}} 'testFunctionTemplateDecl::D' // CHECK-NEXT: | `-CXXRecord 0x{{.+}} 'D' - // CHECK-NEXT: |-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::D':'testFunctionTemplateDecl::D' + // CHECK-NEXT: |-ParmVarDecl 0x{{.+}} col:51 'testFunctionTemplateDecl::D' // CHECK-NEXT: `-CompoundStmt 0x{{.+}} // CHECK: FunctionDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:[[@LINE-32]]:3, col:41> col:19 TestFunctionTemplate 'void (B)' @@ -500,7 +500,7 @@ namespace testCanonicalTemplate { // CHECK-NEXT: |-TemplateArgument type 'testCanonicalTemplate::A'{{$}} // CHECK-NEXT: | `-RecordType 0x{{.+}} 'testCanonicalTemplate::A'{{$}} // CHECK-NEXT: | `-CXXRecord 0x{{.+}} 'A'{{$}} - // CHECK-NEXT: `-ParmVarDecl 0x{{.*}} col:51 'testCanonicalTemplate::A':'testCanonicalTemplate::A'{{$}} + // CHECK-NEXT: `-ParmVarDecl 0x{{.*}} col:51 'testCanonicalTemplate::A'{{$}} // CHECK: FunctionTemplateDecl 0x{{.+}} prev 0x{{.+}} <{{.+}}:[[@LINE-12]]:3, col:51> col:29 TestFunctionTemplate{{$}} // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 referenced typename depth 0 index 0 T{{$}} @@ -613,15 +613,15 @@ namespace testCanonicalTemplate { // CHECK: VarTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-11]]:7, col:43> col:43 TestVarTemplate{{$}} // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:25 referenced typename depth 0 index 0 T{{$}} // CHECK-NEXT: |-VarDecl 0x{{.+}} col:43 TestVarTemplate 'const T' static{{$}} - // CHECK-NEXT: |-VarTemplateSpecializationDecl 0x{{.+}} parent 0x{{.+}} prev 0x{{.+}} col:14 referenced TestVarTemplate 'const int':'const int' implicit_instantiation cinit{{$}} + // CHECK-NEXT: |-VarTemplateSpecializationDecl 0x{{.+}} parent 0x{{.+}} prev 0x{{.+}} col:14 referenced TestVarTemplate 'const int' implicit_instantiation cinit{{$}} // CHECK-NEXT: | |-NestedNameSpecifier TypeSpec 'testCanonicalTemplate::S'{{$}} // CHECK-NEXT: | |-TemplateArgument type 'int'{{$}} // CHECK-NEXT: | | `-BuiltinType 0x{{.+}} 'int'{{$}} - // CHECK-NEXT: | `-InitListExpr 0x{{.+}} 'int':'int'{{$}} - // CHECK-NEXT: `-VarTemplateSpecializationDecl 0x{{.+}} col:43 referenced TestVarTemplate 'const int':'const int' implicit_instantiation static{{$}} + // CHECK-NEXT: | `-InitListExpr 0x{{.+}} 'int'{{$}} + // CHECK-NEXT: `-VarTemplateSpecializationDecl 0x{{.+}} col:43 referenced TestVarTemplate 'const int' implicit_instantiation static{{$}} // 
CHECK-NEXT: `-TemplateArgument type 'int'{{$}} - // CHECK: VarTemplateSpecializationDecl 0x{{.+}} <{{.+}}:[[@LINE-22]]:28, col:43> col:43 referenced TestVarTemplate 'const int':'const int' implicit_instantiation static{{$}} + // CHECK: VarTemplateSpecializationDecl 0x{{.+}} <{{.+}}:[[@LINE-22]]:28, col:43> col:43 referenced TestVarTemplate 'const int' implicit_instantiation static{{$}} // CHECK-NEXT:`-TemplateArgument type 'int'{{$}} // CHECK-NEXT: `-BuiltinType 0x{{.+}} 'int'{{$}} @@ -630,14 +630,14 @@ namespace testCanonicalTemplate { // CHECK-NEXT: |-VarDecl 0x{{.+}} parent 0x{{.+}} prev 0x{{.+}} col:14 TestVarTemplate 'const T' cinit{{$}} // CHECK-NEXT: | |-NestedNameSpecifier TypeSpec 'testCanonicalTemplate::S'{{$}} // CHECK-NEXT: | `-InitListExpr 0x{{.+}} 'void'{{$}} - // CHECK-NEXT: |-VarTemplateSpecialization 0x{{.+}} 'TestVarTemplate' 'const int':'const int'{{$}} - // CHECK-NEXT: `-VarTemplateSpecialization 0x{{.+}} 'TestVarTemplate' 'const int':'const int'{{$}} + // CHECK-NEXT: |-VarTemplateSpecialization 0x{{.+}} 'TestVarTemplate' 'const int'{{$}} + // CHECK-NEXT: `-VarTemplateSpecialization 0x{{.+}} 'TestVarTemplate' 'const int'{{$}} - // CHECK: VarTemplateSpecializationDecl 0x{{.+}} parent 0x{{.+}} prev 0x{{.+}} <{{.+}}:[[@LINE-31]]:3, col:34> col:14 referenced TestVarTemplate 'const int':'const int' implicit_instantiation cinit{{$}} + // CHECK: VarTemplateSpecializationDecl 0x{{.+}} parent 0x{{.+}} prev 0x{{.+}} <{{.+}}:[[@LINE-31]]:3, col:34> col:14 referenced TestVarTemplate 'const int' implicit_instantiation cinit{{$}} // CHECK-NEXT: |-NestedNameSpecifier TypeSpec 'testCanonicalTemplate::S'{{$}} // CHECK-NEXT: |-TemplateArgument type 'int'{{$}} // CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'int'{{$}} - // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'int':'int'{{$}} + // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'int'{{$}} } template @@ -856,7 +856,7 @@ namespace TestConstexprVariableTemplateWithInitializer { inline constexpr in_place_type_t<_Tp> in_place_type{}; // CHECK: -VarTemplateDecl 0x{{.+}} col:41 in_place_type{{$}} // CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:22 referenced typename depth 0 index 0 _Tp{{$}} - // CHECK-NEXT: `-VarDecl 0x{{.+}} col:41 in_place_type 'const in_place_type_t<_Tp>':'const in_place_type_t<_Tp>' inline constexpr listinit{{$}} + // CHECK-NEXT: `-VarDecl 0x{{.+}} col:41 in_place_type 'const in_place_type_t<_Tp>' inline constexpr listinit{{$}} // CHECK-NEXT: `-InitListExpr 0x{{.+}} 'void'{{$}} template constexpr T call_init(0); diff --git a/clang/test/AST/ast-dump-decl.m b/clang/test/AST/ast-dump-decl.m index 0b259d522645d..5f09b6042f409 100644 --- a/clang/test/AST/ast-dump-decl.m +++ b/clang/test/AST/ast-dump-decl.m @@ -94,7 +94,7 @@ @interface TestGenericInterface : A
<P>
{ // CHECK: ObjCInterfaceDecl{{.*}} TestGenericInterface // CHECK-NEXT: -super ObjCInterface {{.+}} 'A' // CHECK-NEXT: -ObjCProtocol {{.+}} 'P' -// CHECK-NEXT: -ObjCTypeParamDecl {{.+}} col:33 T 'id':'id' +// CHECK-NEXT: -ObjCTypeParamDecl {{.+}} col:33 T 'id' @implementation TestObjCClass (TestObjCCategoryDecl) - (void) bar { diff --git a/clang/test/AST/ast-dump-expr-json.c b/clang/test/AST/ast-dump-expr-json.c index 14238283af3f2..e910864eeed65 100644 --- a/clang/test/AST/ast-dump-expr-json.c +++ b/clang/test/AST/ast-dump-expr-json.c @@ -4080,7 +4080,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "b", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: } // CHECK-NEXT: }, @@ -4530,7 +4529,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4539,7 +4537,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "b", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -4817,7 +4814,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -4839,7 +4835,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4860,7 +4855,6 @@ void PrimaryExpressions(int a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "struct S", // CHECK-NEXT: "qualType": "struct S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", diff --git a/clang/test/AST/ast-dump-expr-json.cpp b/clang/test/AST/ast-dump-expr-json.cpp index eac0346d64319..0fb07b0b434cc 100644 --- a/clang/test/AST/ast-dump-expr-json.cpp +++ b/clang/test/AST/ast-dump-expr-json.cpp @@ -325,7 +325,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "obj1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: }, @@ -462,7 +461,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -471,7 +469,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "obj1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -733,7 +730,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -742,7 +738,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "obj1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ 
-2539,7 +2534,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: }, @@ -2672,7 +2666,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -2681,7 +2674,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2848,7 +2840,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int", // CHECK-NEXT: "qualType": "int" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -2948,7 +2939,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "float", // CHECK-NEXT: "qualType": "float" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -2992,7 +2982,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -3001,7 +2990,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -3169,7 +3157,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -3178,7 +3165,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -3247,7 +3233,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -3256,7 +3241,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -3500,7 +3484,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -3509,7 +3492,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "a", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -3537,7 +3519,6 @@ void TestNonADLCall3() { // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", // CHECK-NEXT: "typeArg": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: }, @@ -3562,11 +3543,9 @@ void TestNonADLCall3() { // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", // CHECK-NEXT: "typeArg": { -// CHECK-NEXT: 
"desugaredQualType": "const volatile S", // CHECK-NEXT: "qualType": "const volatile S" // CHECK-NEXT: }, // CHECK-NEXT: "adjustedTypeArg": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: } // CHECK-NEXT: } @@ -6591,7 +6570,6 @@ void TestNonADLCall3() { // CHECK-NEXT: }, // CHECK-NEXT: "isImplicit": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int", // CHECK-NEXT: "qualType": "int" // CHECK-NEXT: } // CHECK-NEXT: } @@ -7858,7 +7836,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "init": "call", @@ -7879,7 +7856,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -8030,7 +8006,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -8039,7 +8014,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: } // CHECK-NEXT: } @@ -8139,7 +8113,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -8187,7 +8160,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -8196,7 +8168,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: } // CHECK-NEXT: } @@ -8299,7 +8270,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "init": "call", @@ -8320,7 +8290,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -8470,7 +8439,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -8479,7 +8447,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: } // CHECK-NEXT: } @@ -8582,7 +8549,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "init": "call", @@ -8603,7 +8569,6 @@ void TestNonADLCall3() { // 
CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -8799,7 +8764,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -8808,7 +8772,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: } // CHECK-NEXT: } @@ -8908,7 +8871,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -8956,7 +8918,6 @@ void TestNonADLCall3() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -8965,7 +8926,6 @@ void TestNonADLCall3() { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "x", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::X", // CHECK-NEXT: "qualType": "NS::X" // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/clang/test/AST/ast-dump-expr-json.m b/clang/test/AST/ast-dump-expr-json.m index bbde4451f9fe3..3c502c0496308 100644 --- a/clang/test/AST/ast-dump-expr-json.m +++ b/clang/test/AST/ast-dump-expr-json.m @@ -764,7 +764,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "Obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: } @@ -825,7 +824,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -848,7 +846,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -858,7 +855,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: "kind": "ParmVarDecl", // CHECK-NEXT: "name": "Obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: } @@ -2036,7 +2032,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2468,7 +2463,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2490,7 +2484,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2678,7 +2671,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } 
// CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2700,7 +2692,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2864,7 +2855,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "i", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -2886,7 +2876,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -3163,7 +3152,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -3312,7 +3300,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -3788,7 +3775,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -3810,7 +3796,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -3998,7 +3983,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4020,7 +4004,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4113,7 +4096,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4208,7 +4190,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4231,7 +4212,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4241,7 +4221,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "i", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: } @@ -4263,7 +4242,6 
@@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4584,7 +4562,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, @@ -4681,7 +4658,6 @@ void TestObjCBoolLiteral(void) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, diff --git a/clang/test/AST/ast-dump-expr.c b/clang/test/AST/ast-dump-expr.c index 8605cd4bbaa16..959d61ec9794b 100644 --- a/clang/test/AST/ast-dump-expr.c +++ b/clang/test/AST/ast-dump-expr.c @@ -259,7 +259,7 @@ void PostfixOperators(int *a, struct S b, struct S *c) { b.a; // CHECK: ImplicitCastExpr // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} 'int' lvalue .a 0x{{[^ ]*}} - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct S':'struct S' lvalue ParmVar 0x{{[^ ]*}} 'b' 'struct S':'struct S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct S' lvalue ParmVar 0x{{[^ ]*}} 'b' 'struct S' c->a; // CHECK: ImplicitCastExpr @@ -280,8 +280,8 @@ void PostfixOperators(int *a, struct S b, struct S *c) { (struct S){1}; // CHECK: ImplicitCastExpr - // CHECK-NEXT: CompoundLiteralExpr 0x{{[^ ]*}} 'struct S':'struct S' lvalue - // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'struct S':'struct S' + // CHECK-NEXT: CompoundLiteralExpr 0x{{[^ ]*}} 'struct S' lvalue + // CHECK-NEXT: InitListExpr 0x{{[^ ]*}} 'struct S' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 1 } diff --git a/clang/test/AST/ast-dump-expr.cpp b/clang/test/AST/ast-dump-expr.cpp index 1c07aa066462b..69e65e22d61d0 100644 --- a/clang/test/AST/ast-dump-expr.cpp +++ b/clang/test/AST/ast-dump-expr.cpp @@ -59,7 +59,7 @@ void Throw() { void PointerToMember(S obj1, S *obj2, int S::* data, void (S::*call)(int)) { obj1.*data; // CHECK: BinaryOperator 0x{{[^ ]*}} 'int' lvalue '.*' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'obj1' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'obj1' 'S' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int S::*' lvalue ParmVar 0x{{[^ ]*}} 'data' 'int S::*' @@ -74,7 +74,7 @@ void PointerToMember(S obj1, S *obj2, int S::* data, void (S::*call)(int)) { // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: ParenExpr 0x{{[^ ]*}} '' // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} '' '.*' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'obj1' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'obj1' 'S' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'void (S::*)(int)' lvalue ParmVar 0x{{[^ ]*}} 'call' 'void (S::*)(int)' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 @@ -178,7 +178,7 @@ void PostfixExpressions(S a, S *p, U *r) { a.func(0); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .func 0x{{[^ ]*}} - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 0 p->func(0); @@ -190,16 +190,16 @@ void PostfixExpressions(S a, S *p, U *r) { // FIXME: 
there is no mention that this used the template keyword. p->template foo(); - // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'int':'int' + // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'int' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' ->foo 0x{{[^ ]*}} // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S *' lvalue ParmVar 0x{{[^ ]*}} 'p' 'S *' // FIXME: there is no mention that this used the template keyword. a.template foo(); - // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'float':'float' + // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'float' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .foo 0x{{[^ ]*}} - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S' p->~S(); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' @@ -210,14 +210,14 @@ void PostfixExpressions(S a, S *p, U *r) { a.~S(); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .~S 0x{{[^ ]*}} - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S' // FIXME: there seems to be no way to distinguish the construct below from // the construct above. a.~decltype(a)(); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .~S 0x{{[^ ]*}} - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S' // FIXME: similarly, there is no way to distinguish the construct below from // the p->~S() case. @@ -233,13 +233,13 @@ void PostfixExpressions(S a, S *p, U *r) { r->template U::~U(); // CHECK: CXXMemberCallExpr 0x{{[^ ]*}} 'void' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' ->~U 0x{{[^ ]*}} - // CHECK-NEXT: NestedNameSpecifier TypeSpecWithTemplate 'U':'U' + // CHECK-NEXT: NestedNameSpecifier TypeSpecWithTemplate 'U' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'U *' lvalue ParmVar 0x{{[^ ]*}} 'r' 'U *' typeid(a); // CHECK: CXXTypeidExpr 0x{{[^ ]*}} 'const std::type_info' lvalue - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S':'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S':'S' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'S' lvalue ParmVar 0x{{[^ ]*}} 'a' 'S' // FIXME: no type information is printed for the argument. typeid(S); @@ -448,7 +448,7 @@ void PrimaryExpressions(Ts... a) { // CHECK-NEXT: CXXMethodDecl 0x{{[^ ]*}} col:3 operator() 'auto () const -> auto' inline // CHECK-NEXT: CompoundStmt // CHECK-NEXT: FieldDecl 0x{{[^ ]*}} col:4 implicit 'Ts...' - // CHECK-NEXT: FieldDecl 0x{{[^ ]*}} col:10 implicit 'int':'int' + // CHECK-NEXT: FieldDecl 0x{{[^ ]*}} col:10 implicit 'int' // CHECK-NEXT: ParenListExpr 0x{{[^ ]*}} 'NULL TYPE' // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Ts' lvalue ParmVar 0x{{[^ ]*}} 'a' 'Ts...' 
// CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 12 diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp index 9228b6e7238be..da0011602a728 100644 --- a/clang/test/AST/ast-dump-fpfeatures.cpp +++ b/clang/test/AST/ast-dump-fpfeatures.cpp @@ -140,7 +140,7 @@ float func_15(float x, float y) { // CHECK: FunctionDecl {{.*}} func_14 'float (float, float)' // CHECK: CompoundStmt // CHECK-NEXT: ReturnStmt -// CHECK-NEXT: BinaryOperator {{.*}} 'float':'float' '+' ConstRoundingMode=towardzero +// CHECK-NEXT: BinaryOperator {{.*}} 'float' '+' ConstRoundingMode=towardzero float func_16(float x, float y) { #pragma STDC FENV_ROUND FE_TOWARDZERO diff --git a/clang/test/AST/ast-dump-funcs.cpp b/clang/test/AST/ast-dump-funcs.cpp index 7d47893d4596d..61fb5d4eb654e 100644 --- a/clang/test/AST/ast-dump-funcs.cpp +++ b/clang/test/AST/ast-dump-funcs.cpp @@ -32,7 +32,7 @@ struct S { // CHECK-NEXT: CXXCtorInitializer Field 0x{{[^ ]*}} 'j' 'int' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'int' 0 // CHECK-NEXT: CXXCtorInitializer Field 0x{{[^ ]*}} 'r' 'R' - // CHECK-NEXT: CXXConstructExpr 0x{{[^ ]*}} 'R':'R' 'void () noexcept' + // CHECK-NEXT: CXXConstructExpr 0x{{[^ ]*}} 'R' 'void () noexcept' // CHECK-NEXT: CompoundStmt 0x{{[^ ]*}} void a(); diff --git a/clang/test/AST/ast-dump-functionprototype.cpp b/clang/test/AST/ast-dump-functionprototype.cpp index 4ba57bd5ace0e..d831e5da2d320 100644 --- a/clang/test/AST/ast-dump-functionprototype.cpp +++ b/clang/test/AST/ast-dump-functionprototype.cpp @@ -5,7 +5,7 @@ struct B {}; typedef void (type1)() noexcept(10 > 5); -// CHECK: TypedefDecl {{.*}} type1 'void () noexcept(10 > 5)':'void () noexcept(10 > 5)' +// CHECK: TypedefDecl {{.*}} type1 'void () noexcept(10 > 5)' // CHECK-NEXT: `-ParenType {{.*}} // CHECK-NEXT: `-FunctionProtoType {{.*}} 'void () noexcept(10 > 5)' exceptionspec_noexcept_true cdecl // CHECK-NEXT: |-NoexceptExpr: ConstantExpr {{.*}} 'bool' @@ -14,9 +14,9 @@ typedef void (type1)() noexcept(10 > 5); typedef void (type2)() throw(A, B); -// CHECK: TypedefDecl {{.*}} type2 'void () throw(A, B)':'void () throw(A, B)' +// CHECK: TypedefDecl {{.*}} type2 'void () throw(A, B)' // CHECK-NEXT: `-ParenType {{.*}} // CHECK-NEXT: `-FunctionProtoType {{.*}} 'void () throw(A, B)' exceptionspec_dynamic cdecl -// CHECK-NEXT: |-Exceptions: 'A':'A', 'B':'B' +// CHECK-NEXT: |-Exceptions: 'A', 'B' // CHECK-NEXT: `-BuiltinType {{.*}} 'void' diff --git a/clang/test/AST/ast-dump-lambda.cpp b/clang/test/AST/ast-dump-lambda.cpp index 56d09b92df0e6..ef8789cd97d3e 100644 --- a/clang/test/AST/ast-dump-lambda.cpp +++ b/clang/test/AST/ast-dump-lambda.cpp @@ -229,7 +229,7 @@ template void test(Ts... a) { // CHECK-NEXT: | | |-CXXMethodDecl {{.*}} col:3{{( imported)?}} operator() 'auto () const -> auto' inline // CHECK-NEXT: | | | `-CompoundStmt {{.*}} // CHECK-NEXT: | | |-FieldDecl {{.*}} col:4{{( imported)?}} implicit 'Ts...' -// CHECK-NEXT: | | `-FieldDecl {{.*}} col:10{{( imported)?}} implicit 'int':'int' +// CHECK-NEXT: | | `-FieldDecl {{.*}} col:10{{( imported)?}} implicit 'int' // CHECK-NEXT: | |-ParenListExpr {{.*}} 'NULL TYPE' // CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Ts' lvalue ParmVar {{.*}} 'a' 'Ts...' 
// CHECK-NEXT: | |-IntegerLiteral {{.*}} 'int' 12 diff --git a/clang/test/AST/ast-dump-objc-arc-json.m b/clang/test/AST/ast-dump-objc-arc-json.m index 5792c8bf58776..86ca28e283a5b 100644 --- a/clang/test/AST/ast-dump-objc-arc-json.m +++ b/clang/test/AST/ast-dump-objc-arc-json.m @@ -22,7 +22,6 @@ id TestCompoundLiteral(id a) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "id", // CHECK-NEXT: "qualType": "id", // CHECK-NEXT: "typeAliasDeclId": "0x{{.*}}" // CHECK-NEXT: }, diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_reference.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_reference.cpp index ef1d2033c5d10..1937a5d1c3eb3 100644 --- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_reference.cpp +++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_reference.cpp @@ -121,7 +121,7 @@ int test(float &&f, short &&s) { // CHECK-NEXT: | | | `-LValueReferenceType [[ADDR_7:0x[a-z0-9]*]] 'float &' // CHECK-NEXT: | | | `-BuiltinType [[ADDR_8:0x[a-z0-9]*]] 'float' // CHECK-NEXT: | | |-CXXRecordDecl [[ADDR_9:0x[a-z0-9]*]] col:29 implicit struct remove_reference -// CHECK-NEXT: | | `-TypedefDecl [[ADDR_10:0x[a-z0-9]*]] col:67 referenced type 'float':'float' +// CHECK-NEXT: | | `-TypedefDecl [[ADDR_10:0x[a-z0-9]*]] col:67 referenced type 'float' // CHECK-NEXT: | | `-SubstTemplateTypeParmType [[ADDR_11:0x[a-z0-9]*]] 'float' sugar class depth 0 index 0 _Tp // CHECK-NEXT: | | |-ClassTemplateSpecialization [[ADDR_6]] 'remove_reference' // CHECK-NEXT: | | `-BuiltinType [[ADDR_8]] 'float' @@ -137,7 +137,7 @@ int test(float &&f, short &&s) { // CHECK-NEXT: | | `-LValueReferenceType [[ADDR_15:0x[a-z0-9]*]] 'short &' // CHECK-NEXT: | | `-BuiltinType [[ADDR_16:0x[a-z0-9]*]] 'short' // CHECK-NEXT: | |-CXXRecordDecl [[ADDR_17:0x[a-z0-9]*]] col:29 implicit struct remove_reference -// CHECK-NEXT: | `-TypedefDecl [[ADDR_18:0x[a-z0-9]*]] col:67 referenced type 'short':'short' +// CHECK-NEXT: | `-TypedefDecl [[ADDR_18:0x[a-z0-9]*]] col:67 referenced type 'short' // CHECK-NEXT: | `-SubstTemplateTypeParmType [[ADDR_19:0x[a-z0-9]*]] 'short' sugar class depth 0 index 0 _Tp // CHECK-NEXT: | |-ClassTemplateSpecialization [[ADDR_14]] 'remove_reference' // CHECK-NEXT: | `-BuiltinType [[ADDR_16]] 'short' diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_1.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_1.cpp index 5916958b94625..0dfed6ffa240d 100644 --- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_1.cpp +++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_1.cpp @@ -124,14 +124,14 @@ int test() { // CHECK-NEXT: | |-CallExpr [[ADDR_63:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | | |-ImplicitCastExpr [[ADDR_64:0x[a-z0-9]*]] 'int (*)(double)' // CHECK-NEXT: | | | `-DeclRefExpr [[ADDR_65:0x[a-z0-9]*]] 'int (double)' {{.*}}Function [[ADDR_44]] 'also_after' 'int (double)' -// CHECK-NEXT: | | `-CXXFunctionalCastExpr [[ADDR_66:0x[a-z0-9]*]] 'double':'double' functional cast to double -// CHECK-NEXT: | | `-ImplicitCastExpr [[ADDR_67:0x[a-z0-9]*]] 'double':'double' part_of_explicit_cast +// CHECK-NEXT: | | `-CXXFunctionalCastExpr [[ADDR_66:0x[a-z0-9]*]] 'double' functional cast to double +// CHECK-NEXT: | | `-ImplicitCastExpr [[ADDR_67:0x[a-z0-9]*]] 'double' part_of_explicit_cast // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_58]] 'int' 0 // CHECK-NEXT: | `-CallExpr [[ADDR_68:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | |-ImplicitCastExpr [[ADDR_69:0x[a-z0-9]*]] 'int (*)(double)' // 
CHECK-NEXT: | | `-DeclRefExpr [[ADDR_25]] 'int (double)' Function [[ADDR_26]] 'also_after[implementation={vendor(llvm)}]' 'int (double)' -// CHECK-NEXT: | `-CXXFunctionalCastExpr [[ADDR_66]] 'double':'double' functional cast to double -// CHECK-NEXT: | `-ImplicitCastExpr [[ADDR_67]] 'double':'double' part_of_explicit_cast +// CHECK-NEXT: | `-CXXFunctionalCastExpr [[ADDR_66]] 'double' functional cast to double +// CHECK-NEXT: | `-ImplicitCastExpr [[ADDR_67]] 'double' part_of_explicit_cast // CHECK-NEXT: | `-IntegerLiteral [[ADDR_58]] 'int' 0 // CHECK-NEXT: |-TypedefDecl [[ADDR_70:0x[a-z0-9]*]] col:14 referenced Ty 'int (*)({{.*}})' // CHECK-NEXT: | `-PointerType [[ADDR_71:0x[a-z0-9]*]] 'int (*)({{.*}})' diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp index 9613e86634927..da46cef7f3f1b 100644 --- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp +++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_2.cpp @@ -66,7 +66,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_10:0x[a-z0-9]*]] line:5:5 used also_before 'int (double)' // CHECK-NEXT: | |-TemplateArgument type 'double' // CHECK-NEXT: | | `-BuiltinType [[ADDR_11:0x[a-z0-9]*]] 'double' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_12:0x[a-z0-9]*]] col:18 'double':'double' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_12:0x[a-z0-9]*]] col:18 'double' // CHECK-NEXT: | |-CompoundStmt [[ADDR_13:0x[a-z0-9]*]] // CHECK-NEXT: | | `-ReturnStmt [[ADDR_14:0x[a-z0-9]*]] // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_6]] 'int' 1 @@ -97,7 +97,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_17]] line:18:1 also_before[implementation={extension(allow_templates)}] 'int (double)' // CHECK-NEXT: | |-TemplateArgument type 'double' // CHECK-NEXT: | | `-BuiltinType [[ADDR_11]] 'double' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]] col:18 'double':'double' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]] col:18 'double' // CHECK-NEXT: | `-CompoundStmt [[ADDR_38:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_39:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_36]] 'int' 0 @@ -110,7 +110,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_47:0x[a-z0-9]*]] line:44:5 used also_after 'int (char)' // CHECK-NEXT: | |-TemplateArgument type 'char' // CHECK-NEXT: | | `-BuiltinType [[ADDR_48:0x[a-z0-9]*]] 'char' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_49:0x[a-z0-9]*]] col:17 'char':'char' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_49:0x[a-z0-9]*]] col:17 'char' // CHECK-NEXT: | |-CompoundStmt [[ADDR_50:0x[a-z0-9]*]] // CHECK-NEXT: | | `-ReturnStmt [[ADDR_51:0x[a-z0-9]*]] // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_52:0x[a-z0-9]*]] 'int' 6 @@ -126,7 +126,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_55]] line:22:1 also_after[implementation={extension(allow_templates)}] 'int (char)' // CHECK-NEXT: | |-TemplateArgument type 'char' // CHECK-NEXT: | | `-BuiltinType [[ADDR_48]] 'char' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_60:0x[a-z0-9]*]] col:17 'char':'char' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_60:0x[a-z0-9]*]] col:17 'char' // CHECK-NEXT: | `-CompoundStmt [[ADDR_61:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_62:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_59]] 'int' 0 @@ -213,7 +213,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_128:0x[a-z0-9]*]] line:48:5 used also_after_mismatch 'int (int)' // CHECK-NEXT: | |-TemplateArgument type 'int' // CHECK-NEXT: | | `-BuiltinType 
[[ADDR_129:0x[a-z0-9]*]] 'int' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_130:0x[a-z0-9]*]] col:26 'int':'int' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_130:0x[a-z0-9]*]] col:26 'int' // CHECK-NEXT: | `-CompoundStmt [[ADDR_131:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_132:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_127]] 'int' 0 @@ -244,12 +244,12 @@ int test() { // CHECK-NEXT: | | |-CallExpr [[ADDR_155:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | | | |-ImplicitCastExpr [[ADDR_156:0x[a-z0-9]*]] 'int (*)(char)' // CHECK-NEXT: | | | | `-DeclRefExpr [[ADDR_157:0x[a-z0-9]*]] 'int (char)' {{.*}}Function [[ADDR_47]] 'also_after' 'int (char)' (FunctionTemplate [[ADDR_114]] 'also_after') -// CHECK-NEXT: | | | `-ImplicitCastExpr [[ADDR_158:0x[a-z0-9]*]] 'char':'char' +// CHECK-NEXT: | | | `-ImplicitCastExpr [[ADDR_158:0x[a-z0-9]*]] 'char' // CHECK-NEXT: | | | `-IntegerLiteral [[ADDR_159:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: | | `-CallExpr [[ADDR_160:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | | |-ImplicitCastExpr [[ADDR_161:0x[a-z0-9]*]] 'int (*)(char)' // CHECK-NEXT: | | | `-DeclRefExpr [[ADDR_54]] 'int (char)' {{.*}}Function [[ADDR_55]] 'also_after[implementation={extension(allow_templates)}]' 'int (char)' -// CHECK-NEXT: | | `-ImplicitCastExpr [[ADDR_162:0x[a-z0-9]*]] 'char':'char' +// CHECK-NEXT: | | `-ImplicitCastExpr [[ADDR_162:0x[a-z0-9]*]] 'char' // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_159]] 'int' 0 // CHECK-NEXT: | `-CallExpr [[ADDR_163:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | |-ImplicitCastExpr [[ADDR_164:0x[a-z0-9]*]] 'int (*)(int)' diff --git a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_3.cpp b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_3.cpp index 60619801735f2..ad269506042c8 100644 --- a/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_3.cpp +++ b/clang/test/AST/ast-dump-openmp-begin-declare-variant_template_3.cpp @@ -97,14 +97,14 @@ int test() { // CHECK-NEXT: | |-FunctionDecl [[ADDR_32:0x[a-z0-9]*]] line:10:5 used also_before 'int (int)' // CHECK-NEXT: | | |-TemplateArgument type 'int' // CHECK-NEXT: | | | `-BuiltinType [[ADDR_9]] 'int' -// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_33:0x[a-z0-9]*]] col:19 s 'int':'int' +// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_33:0x[a-z0-9]*]] col:19 s 'int' // CHECK-NEXT: | | `-CompoundStmt [[ADDR_34:0x[a-z0-9]*]] // CHECK-NEXT: | | `-ReturnStmt [[ADDR_35:0x[a-z0-9]*]] // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_28]] 'int' 0 // CHECK-NEXT: | `-FunctionDecl [[ADDR_36:0x[a-z0-9]*]] line:10:5 used also_before 'int (double)' // CHECK-NEXT: | |-TemplateArgument type 'double' // CHECK-NEXT: | | `-BuiltinType [[ADDR_21]] 'double' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]] col:19 s 'double':'double' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_37:0x[a-z0-9]*]] col:19 s 'double' // CHECK-NEXT: | `-CompoundStmt [[ADDR_38:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_39:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_28]] 'int' 0 @@ -116,7 +116,7 @@ int test() { // CHECK-NEXT: | | |-DeclStmt [[ADDR_44:0x[a-z0-9]*]] // CHECK-NEXT: | | | `-VarDecl [[ADDR_45:0x[a-z0-9]*]] col:10 referenced t 'double' // CHECK-NEXT: | | |-DeclStmt [[ADDR_46:0x[a-z0-9]*]] -// CHECK-NEXT: | | | `-VarDecl [[ADDR_47:0x[a-z0-9]*]] col:8 q 'S':'S' callinit +// CHECK-NEXT: | | | `-VarDecl [[ADDR_47:0x[a-z0-9]*]] col:8 q 'S' callinit // CHECK-NEXT: | | | `-ParenListExpr [[ADDR_48:0x[a-z0-9]*]] 'NULL TYPE' // CHECK-NEXT: | | | |-IntegerLiteral [[ADDR_49:0x[a-z0-9]*]] 'int' 1 // CHECK-NEXT: | | | `-UnaryOperator 
[[ADDR_50:0x[a-z0-9]*]] 'double *' prefix '&' cannot overflow @@ -126,11 +126,11 @@ int test() { // CHECK-NEXT: | |-FunctionDecl [[ADDR_54:0x[a-z0-9]*]] line:16:1 also_before[implementation={extension(allow_templates)}] 'int (S)' // CHECK-NEXT: | | |-TemplateArgument type 'int' // CHECK-NEXT: | | | `-BuiltinType [[ADDR_9]] 'int' -// CHECK-NEXT: | | `-ParmVarDecl [[ADDR_55:0x[a-z0-9]*]] col:22 s 'S':'S' +// CHECK-NEXT: | | `-ParmVarDecl [[ADDR_55:0x[a-z0-9]*]] col:22 s 'S' // CHECK-NEXT: | `-FunctionDecl [[ADDR_56:0x[a-z0-9]*]] line:16:1 also_before[implementation={extension(allow_templates)}] 'int (S)' // CHECK-NEXT: | |-TemplateArgument type 'double' // CHECK-NEXT: | | `-BuiltinType [[ADDR_21]] 'double' -// CHECK-NEXT: | `-ParmVarDecl [[ADDR_57:0x[a-z0-9]*]] col:22 s 'S':'S' +// CHECK-NEXT: | `-ParmVarDecl [[ADDR_57:0x[a-z0-9]*]] col:22 s 'S' // CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_58:0x[a-z0-9]*]] col:5 implicit special // CHECK-NEXT: | |-TemplateTypeParmDecl [[ADDR_59:0x[a-z0-9]*]] col:20 referenced typename depth 0 index 0 T // CHECK-NEXT: | |-FunctionDecl [[ADDR_60:0x[a-z0-9]*]] col:5 special 'int (S)' @@ -140,7 +140,7 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_65:0x[a-z0-9]*]] col:5 used special 'int (S)' // CHECK-NEXT: | |-TemplateArgument type 'int' // CHECK-NEXT: | | `-BuiltinType [[ADDR_9]] 'int' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_66:0x[a-z0-9]*]] col:18 s 'S':'S' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_66:0x[a-z0-9]*]] col:18 s 'S' // CHECK-NEXT: | `-OMPDeclareVariantAttr [[ADDR_67:0x[a-z0-9]*]] <> Implicit implementation={extension(allow_templates)} // CHECK-NEXT: | `-DeclRefExpr [[ADDR_68:0x[a-z0-9]*]] 'int (S)' {{.*}}Function [[ADDR_69:0x[a-z0-9]*]] 'special[implementation={extension(allow_templates)}]' 'int (S)' // CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_70:0x[a-z0-9]*]] line:23:1 special[implementation={extension(allow_templates)}] @@ -151,7 +151,7 @@ int test() { // CHECK-NEXT: | | |-DeclStmt [[ADDR_72:0x[a-z0-9]*]] // CHECK-NEXT: | | | `-VarDecl [[ADDR_73:0x[a-z0-9]*]] col:5 referenced t 'T' // CHECK-NEXT: | | |-DeclStmt [[ADDR_74:0x[a-z0-9]*]] -// CHECK-NEXT: | | | `-VarDecl [[ADDR_75:0x[a-z0-9]*]] col:8 q 'S':'S' callinit +// CHECK-NEXT: | | | `-VarDecl [[ADDR_75:0x[a-z0-9]*]] col:8 q 'S' callinit // CHECK-NEXT: | | | `-ParenListExpr [[ADDR_76:0x[a-z0-9]*]] 'NULL TYPE' // CHECK-NEXT: | | | |-IntegerLiteral [[ADDR_77:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: | | | `-UnaryOperator [[ADDR_78:0x[a-z0-9]*]] '' prefix '&' cannot overflow @@ -161,16 +161,16 @@ int test() { // CHECK-NEXT: | `-FunctionDecl [[ADDR_69]] line:23:1 special[implementation={extension(allow_templates)}] 'int (S)' // CHECK-NEXT: | |-TemplateArgument type 'int' // CHECK-NEXT: | | `-BuiltinType [[ADDR_9]] 'int' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_82:0x[a-z0-9]*]] col:18 s 'S':'S' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_82:0x[a-z0-9]*]] col:18 s 'S' // CHECK-NEXT: | `-CompoundStmt [[ADDR_83:0x[a-z0-9]*]] // CHECK-NEXT: | |-DeclStmt [[ADDR_84:0x[a-z0-9]*]] -// CHECK-NEXT: | | `-VarDecl [[ADDR_85:0x[a-z0-9]*]] col:5 used t 'int':'int' +// CHECK-NEXT: | | `-VarDecl [[ADDR_85:0x[a-z0-9]*]] col:5 used t 'int' // CHECK-NEXT: | |-DeclStmt [[ADDR_86:0x[a-z0-9]*]] -// CHECK-NEXT: | | `-VarDecl [[ADDR_87:0x[a-z0-9]*]] col:8 q 'S':'S' callinit -// CHECK-NEXT: | | `-CXXConstructExpr [[ADDR_88:0x[a-z0-9]*]] 'S':'S' 'void (int, int *)' +// CHECK-NEXT: | | `-VarDecl [[ADDR_87:0x[a-z0-9]*]] col:8 q 'S' callinit +// CHECK-NEXT: | | `-CXXConstructExpr [[ADDR_88:0x[a-z0-9]*]] 'S' 'void (int, int *)' // 
CHECK-NEXT: | | |-IntegerLiteral [[ADDR_77]] 'int' 0 // CHECK-NEXT: | | `-UnaryOperator [[ADDR_89:0x[a-z0-9]*]] 'int *' prefix '&' cannot overflow -// CHECK-NEXT: | | `-DeclRefExpr [[ADDR_90:0x[a-z0-9]*]] 'int':'int' {{.*}}Var [[ADDR_85]] 't' 'int':'int' +// CHECK-NEXT: | | `-DeclRefExpr [[ADDR_90:0x[a-z0-9]*]] 'int' {{.*}}Var [[ADDR_85]] 't' 'int' // CHECK-NEXT: | `-ReturnStmt [[ADDR_91:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_81]] 'int' 0 // CHECK-NEXT: |-FunctionTemplateDecl [[ADDR_92:0x[a-z0-9]*]] col:5 implicit also_after @@ -187,7 +187,7 @@ int test() { // CHECK-NEXT: | |-DeclStmt [[ADDR_101:0x[a-z0-9]*]] // CHECK-NEXT: | | `-VarDecl [[ADDR_102:0x[a-z0-9]*]] col:10 referenced t 'double' // CHECK-NEXT: | |-DeclStmt [[ADDR_103:0x[a-z0-9]*]] -// CHECK-NEXT: | | `-VarDecl [[ADDR_104:0x[a-z0-9]*]] col:8 q 'S':'S' callinit +// CHECK-NEXT: | | `-VarDecl [[ADDR_104:0x[a-z0-9]*]] col:8 q 'S' callinit // CHECK-NEXT: | | `-ParenListExpr [[ADDR_105:0x[a-z0-9]*]] 'NULL TYPE' // CHECK-NEXT: | | |-FloatingLiteral [[ADDR_106:0x[a-z0-9]*]] 'double' 2.000000e+00 // CHECK-NEXT: | | `-UnaryOperator [[ADDR_107:0x[a-z0-9]*]] 'double *' prefix '&' cannot overflow @@ -204,14 +204,14 @@ int test() { // CHECK-NEXT: | |-FunctionDecl [[ADDR_118:0x[a-z0-9]*]] line:38:5 used also_after 'int (int)' // CHECK-NEXT: | | |-TemplateArgument type 'int' // CHECK-NEXT: | | | `-BuiltinType [[ADDR_9]] 'int' -// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_119:0x[a-z0-9]*]] col:18 s 'int':'int' +// CHECK-NEXT: | | |-ParmVarDecl [[ADDR_119:0x[a-z0-9]*]] col:18 s 'int' // CHECK-NEXT: | | `-CompoundStmt [[ADDR_120:0x[a-z0-9]*]] // CHECK-NEXT: | | `-ReturnStmt [[ADDR_121:0x[a-z0-9]*]] // CHECK-NEXT: | | `-IntegerLiteral [[ADDR_117]] 'int' 0 // CHECK-NEXT: | `-FunctionDecl [[ADDR_122:0x[a-z0-9]*]] line:38:5 used also_after 'int (double)' // CHECK-NEXT: | |-TemplateArgument type 'double' // CHECK-NEXT: | | `-BuiltinType [[ADDR_21]] 'double' -// CHECK-NEXT: | |-ParmVarDecl [[ADDR_123:0x[a-z0-9]*]] col:18 s 'double':'double' +// CHECK-NEXT: | |-ParmVarDecl [[ADDR_123:0x[a-z0-9]*]] col:18 s 'double' // CHECK-NEXT: | `-CompoundStmt [[ADDR_124:0x[a-z0-9]*]] // CHECK-NEXT: | `-ReturnStmt [[ADDR_125:0x[a-z0-9]*]] // CHECK-NEXT: | `-IntegerLiteral [[ADDR_117]] 'int' 0 @@ -243,19 +243,19 @@ int test() { // CHECK-NEXT: |-CallExpr [[ADDR_151:0x[a-z0-9]*]] 'int' // CHECK-NEXT: | |-ImplicitCastExpr [[ADDR_152:0x[a-z0-9]*]] 'int (*)(S)' // CHECK-NEXT: | | `-DeclRefExpr [[ADDR_153:0x[a-z0-9]*]] 'int (S)' {{.*}}Function [[ADDR_65]] 'special' 'int (S)' (FunctionTemplate [[ADDR_58]] 'special') -// CXX11-NEXT: | `-CXXConstructExpr [[ADDR_154:0x[a-z0-9]*]] 'S':'S' 'void (S &&) noexcept' elidable -// CXX11-NEXT: | `-MaterializeTemporaryExpr [[ADDR_155:0x[a-z0-9]*]] 'S':'S' xvalue -// CHECK-NEXT: | `-CXXTemporaryObjectExpr [[ADDR_156:0x[a-z0-9]*]] 'S':'S' 'void (int, int *)' +// CXX11-NEXT: | `-CXXConstructExpr [[ADDR_154:0x[a-z0-9]*]] 'S' 'void (S &&) noexcept' elidable +// CXX11-NEXT: | `-MaterializeTemporaryExpr [[ADDR_155:0x[a-z0-9]*]] 'S' xvalue +// CHECK-NEXT: | `-CXXTemporaryObjectExpr [[ADDR_156:0x[a-z0-9]*]] 'S' 'void (int, int *)' // CHECK-NEXT: | |-IntegerLiteral [[ADDR_157:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: | `-ImplicitCastExpr [[ADDR_158:0x[a-z0-9]*]] 'int *' // CHECK-NEXT: | `-IntegerLiteral [[ADDR_159:0x[a-z0-9]*]] 'int' 0 // CHECK-NEXT: `-CallExpr [[ADDR_160:0x[a-z0-9]*]] 'int' // CHECK-NEXT: |-ImplicitCastExpr [[ADDR_161:0x[a-z0-9]*]] 'int (*)(S)' // CHECK-NEXT: | `-DeclRefExpr [[ADDR_68]] 'int (S)' {{.*}}Function [[ADDR_69]] 
'special[implementation={extension(allow_templates)}]' 'int (S)' -// CXX11-NEXT: `-CXXConstructExpr [[ADDR_162:0x[a-z0-9]*]] 'S':'S' 'void (S &&) noexcept' elidable -// CXX11-NEXT: `-MaterializeTemporaryExpr [[ADDR_163:0x[a-z0-9]*]] 'S':'S' xvalue -// CXX11-NEXT: `-CXXTemporaryObjectExpr [[ADDR_156]] 'S':'S' 'void (int, int *)' -// CXX17-NEXT: `-CXXTemporaryObjectExpr [[ADDR_156]] 'S':'S' 'void (int, int *)' +// CXX11-NEXT: `-CXXConstructExpr [[ADDR_162:0x[a-z0-9]*]] 'S' 'void (S &&) noexcept' elidable +// CXX11-NEXT: `-MaterializeTemporaryExpr [[ADDR_163:0x[a-z0-9]*]] 'S' xvalue +// CXX11-NEXT: `-CXXTemporaryObjectExpr [[ADDR_156]] 'S' 'void (int, int *)' +// CXX17-NEXT: `-CXXTemporaryObjectExpr [[ADDR_156]] 'S' 'void (int, int *)' // CHECK-NEXT: |-IntegerLiteral [[ADDR_157]] 'int' 0 // CHECK-NEXT: `-ImplicitCastExpr [[ADDR_158]] 'int *' // CHECK-NEXT: `-IntegerLiteral [[ADDR_159]] 'int' 0 diff --git a/clang/test/AST/ast-dump-overloaded-operators.cpp b/clang/test/AST/ast-dump-overloaded-operators.cpp index dc6d7bdc9b085..639a0d9874eb0 100644 --- a/clang/test/AST/ast-dump-overloaded-operators.cpp +++ b/clang/test/AST/ast-dump-overloaded-operators.cpp @@ -31,14 +31,14 @@ void test() { // CHECK-NEXT: |-CXXOperatorCallExpr {{.*}} 'void' '+' // CHECK-NEXT: | |-ImplicitCastExpr {{.*}} 'void (*)(E, E)' // CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'void (E, E)' lvalue Function {{.*}} 'operator+' 'void (E, E)' -// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} 'E':'E' -// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'E':'E' lvalue Var {{.*}} 'e' 'E' -// CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'E':'E' -// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'E':'E' lvalue Var {{.*}} 'e' 'E' +// CHECK-NEXT: | |-ImplicitCastExpr {{.*}} 'E' +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'E' lvalue Var {{.*}} 'e' 'E' +// CHECK-NEXT: | `-ImplicitCastExpr {{.*}} 'E' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'E' lvalue Var {{.*}} 'e' 'E' // CHECK-NEXT: `-CXXOperatorCallExpr {{.*}} 'void' ',' // CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'void (*)(E, E)' // CHECK-NEXT: | `-DeclRefExpr {{.*}} 'void (E, E)' lvalue Function {{.*}} 'operator,' 'void (E, E)' -// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'E':'E' -// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'E':'E' lvalue Var {{.*}} 'e' 'E' -// CHECK-NEXT: `-ImplicitCastExpr {{.*}} 'E':'E' -// CHECK-NEXT: `-DeclRefExpr {{.*}} 'E':'E' lvalue Var {{.*}} 'e' 'E' +// CHECK-NEXT: |-ImplicitCastExpr {{.*}} 'E' +// CHECK-NEXT: | `-DeclRefExpr {{.*}} 'E' lvalue Var {{.*}} 'e' 'E' +// CHECK-NEXT: `-ImplicitCastExpr {{.*}} 'E' +// CHECK-NEXT: `-DeclRefExpr {{.*}} 'E' lvalue Var {{.*}} 'e' 'E' diff --git a/clang/test/AST/ast-dump-records-json.cpp b/clang/test/AST/ast-dump-records-json.cpp index bc53d03176f66..a7eb8771d3f02 100644 --- a/clang/test/AST/ast-dump-records-json.cpp +++ b/clang/test/AST/ast-dump-records-json.cpp @@ -3266,7 +3266,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: { // CHECK-NEXT: "access": "public", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base1", // CHECK-NEXT: "qualType": "Base1" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "none" @@ -3378,7 +3377,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: { // CHECK-NEXT: "access": "private", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base1", // CHECK-NEXT: "qualType": "Base1" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "private" @@ -3479,7 +3477,6 @@ struct Derived6 : virtual public Bases... 
{ // CHECK-NEXT: "access": "public", // CHECK-NEXT: "isVirtual": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base1", // CHECK-NEXT: "qualType": "Base1" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "none" @@ -3718,7 +3715,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: { // CHECK-NEXT: "access": "public", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base1", // CHECK-NEXT: "qualType": "Base1" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "none" @@ -3727,7 +3723,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: "access": "public", // CHECK-NEXT: "isVirtual": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base2", // CHECK-NEXT: "qualType": "Base2" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "none" @@ -3735,7 +3730,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: { // CHECK-NEXT: "access": "protected", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base3", // CHECK-NEXT: "qualType": "Base3" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "protected" @@ -3975,7 +3969,6 @@ struct Derived6 : virtual public Bases... { // CHECK-NEXT: "access": "protected", // CHECK-NEXT: "isVirtual": true, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Base1", // CHECK-NEXT: "qualType": "Base1" // CHECK-NEXT: }, // CHECK-NEXT: "writtenAccess": "protected" diff --git a/clang/test/AST/ast-dump-records.cpp b/clang/test/AST/ast-dump-records.cpp index 7a93b69d37ba6..bfd8892698d4b 100644 --- a/clang/test/AST/ast-dump-records.cpp +++ b/clang/test/AST/ast-dump-records.cpp @@ -301,10 +301,10 @@ class NonTrivial { struct CheckFullExpression { // CHECK: |-CXXRecordDecl {{.*}} struct CheckFullExpression definition NonTrivial value = NonTrivial(); -// CHECK: | |-FieldDecl {{.*}} value 'NonTrivial':'NonTrivial' -// CHECK-NEXT: | | `-ExprWithCleanups {{.*}} 'NonTrivial':'NonTrivial' -// CHECK-NEXT: | | `-CXXBindTemporaryExpr {{.*}} 'NonTrivial':'NonTrivial' (CXXTemporary{{.*}}) -// CHECK-NEXT: | | `-CXXTemporaryObjectExpr {{.*}} 'NonTrivial':'NonTrivial' 'void ()' +// CHECK: | |-FieldDecl {{.*}} value 'NonTrivial' +// CHECK-NEXT: | | `-ExprWithCleanups {{.*}} 'NonTrivial' +// CHECK-NEXT: | | `-CXXBindTemporaryExpr {{.*}} 'NonTrivial' (CXXTemporary{{.*}}) +// CHECK-NEXT: | | `-CXXTemporaryObjectExpr {{.*}} 'NonTrivial' 'void ()' }; struct CheckNoCleanup { diff --git a/clang/test/AST/ast-dump-recovery.cpp b/clang/test/AST/ast-dump-recovery.cpp index 278b9fc000b57..cfb013585ad74 100644 --- a/clang/test/AST/ast-dump-recovery.cpp +++ b/clang/test/AST/ast-dump-recovery.cpp @@ -202,27 +202,27 @@ void InvalidInitalizer(int x) { // CHECK-NEXT: `-InitListExpr Bar b2 = {1}; // CHECK: `-VarDecl {{.*}} b3 'Bar' - // CHECK-NEXT: `-RecoveryExpr {{.*}} 'Bar':'Bar' contains-errors + // CHECK-NEXT: `-RecoveryExpr {{.*}} 'Bar' contains-errors // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' Bar b3 = Bar(x); // CHECK: `-VarDecl {{.*}} b4 'Bar' - // CHECK-NEXT: `-RecoveryExpr {{.*}} 'Bar':'Bar' contains-errors + // CHECK-NEXT: `-RecoveryExpr {{.*}} 'Bar' contains-errors // CHECK-NEXT: `-InitListExpr {{.*}} 'void' // CHECK-NEXT: `-DeclRefExpr {{.*}} 'x' 'int' Bar b4 = Bar{x}; // CHECK: `-VarDecl {{.*}} b5 'Bar' - // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar':'Bar' contains-errors 'Bar' + // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar' // CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors // CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'invalid' Bar b5 = 
Bar(invalid()); // CHECK: `-VarDecl {{.*}} b6 'Bar' - // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar':'Bar' contains-errors 'Bar' + // CHECK-NEXT: `-CXXUnresolvedConstructExpr {{.*}} 'Bar' contains-errors 'Bar' // CHECK-NEXT: `-InitListExpr {{.*}} contains-errors // CHECK-NEXT: `-RecoveryExpr {{.*}} contains-errors // CHECK-NEXT: `-UnresolvedLookupExpr {{.*}} 'invalid' Bar b6 = Bar{invalid()}; - // CHECK: RecoveryExpr {{.*}} 'Bar':'Bar' contains-errors + // CHECK: RecoveryExpr {{.*}} 'Bar' contains-errors // CHECK-NEXT: `-IntegerLiteral {{.*}} 'int' 1 Bar(1); @@ -326,7 +326,7 @@ void CtorInitializer() { // CHECK-NEXT: | `-RecoveryExpr {{.*}} '' // CHECK-NEXT: | `-UnresolvedLookupExpr {{.*}} '' // CHECK-NEXT: |-CXXCtorInitializer Field {{.*}} 's' 'S' - // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S':'S' contains-errors + // CHECK-NEXT: | `-RecoveryExpr {{.*}} 'S' contains-errors // CHECK-NEXT: | |-IntegerLiteral {{.*}} 1 // CHECK-NEXT: | `-IntegerLiteral {{.*}} 2 }; diff --git a/clang/test/AST/ast-dump-stmt-json.cpp b/clang/test/AST/ast-dump-stmt-json.cpp index 8e42a2e276b46..667a12a012024 100644 --- a/clang/test/AST/ast-dump-stmt-json.cpp +++ b/clang/test/AST/ast-dump-stmt-json.cpp @@ -2265,7 +2265,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isReferenced": true, // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2331,7 +2330,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -2340,7 +2338,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2429,7 +2426,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -2438,7 +2434,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2593,7 +2588,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -2602,7 +2596,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "DependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "DependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2779,7 +2772,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // 
CHECK-NEXT: "isReferenced": true, // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "OtherDependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "OtherDependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -2867,7 +2859,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "OtherDependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "OtherDependentScopeMemberExprWrapper" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -2876,7 +2867,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "obj", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "OtherDependentScopeMemberExprWrapper", // CHECK-NEXT: "qualType": "OtherDependentScopeMemberExprWrapper" // CHECK-NEXT: } // CHECK-NEXT: } @@ -3037,7 +3027,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "U", // CHECK-NEXT: "qualType": "U" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -3066,7 +3055,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "U", // CHECK-NEXT: "qualType": "U" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -4563,7 +4551,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -4663,7 +4650,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -4804,7 +4790,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -4826,7 +4811,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4835,7 +4819,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -4858,7 +4841,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -4880,7 +4862,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4889,7 +4870,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { 
-// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -4914,7 +4894,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4937,7 +4916,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -4946,7 +4924,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5055,7 +5032,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -5077,7 +5053,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5086,7 +5061,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5160,7 +5134,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "C", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: }, // CHECK-NEXT: "init": "call", @@ -5181,7 +5154,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -5275,7 +5247,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5284,7 +5255,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "C", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5333,7 +5303,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -5418,7 +5387,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5481,7 +5449,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // 
CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -5566,7 +5533,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Container", // CHECK-NEXT: "qualType": "Container" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5626,7 +5592,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -5648,7 +5613,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5657,7 +5621,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5680,7 +5643,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -5702,7 +5664,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5711,7 +5672,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5736,7 +5696,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5759,7 +5718,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5768,7 +5726,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -5877,7 +5834,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -5899,7 +5855,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -5908,7 +5863,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": 
"VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -6120,7 +6074,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -6220,7 +6173,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "isUsed": true, // CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "init": "c", @@ -6361,7 +6313,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -6383,7 +6334,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -6392,7 +6342,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -6415,7 +6364,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -6437,7 +6385,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -6446,7 +6393,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__end1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -6471,7 +6417,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -6494,7 +6439,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -6503,7 +6447,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } @@ -6612,7 +6555,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -6634,7 +6576,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // 
CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -6643,7 +6584,6 @@ void TestDependentGenericSelectionExpr(Ty T) { // CHECK-NEXT: "kind": "VarDecl", // CHECK-NEXT: "name": "__begin1", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "int *", // CHECK-NEXT: "qualType": "int *" // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/clang/test/AST/ast-dump-stmt.cpp b/clang/test/AST/ast-dump-stmt.cpp index 4be66756e6c1e..407584e5b82de 100644 --- a/clang/test/AST/ast-dump-stmt.cpp +++ b/clang/test/AST/ast-dump-stmt.cpp @@ -99,8 +99,8 @@ void TestUnionInitList() U us[3] = {1}; // CHECK: VarDecl {{.+}} col:5 us 'U[3]' cinit // CHECK-NEXT: `-InitListExpr {{.+}} 'U[3]' -// CHECK-NEXT: |-array_filler: InitListExpr {{.+}} 'U':'U' field Field {{.+}} 'i' 'int' -// CHECK-NEXT: `-InitListExpr {{.+}} 'U':'U' field Field {{.+}} 'i' 'int' +// CHECK-NEXT: |-array_filler: InitListExpr {{.+}} 'U' field Field {{.+}} 'i' 'int' +// CHECK-NEXT: `-InitListExpr {{.+}} 'U' field Field {{.+}} 'i' 'int' // CHECK-NEXT: `-IntegerLiteral {{.+}} 'int' 1 } @@ -198,28 +198,28 @@ void TestIteration() { // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:16 implicit used __range1 'int (&)[10]' cinit // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} 'vals' 'int[10]' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __begin1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __begin1 'int *' cinit // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __end1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __end1 'int *' cinit // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *':'int *' - // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *':'int *' lvalue prefix '++' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *' + // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *' lvalue prefix '++' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:12 v 'int' cinit // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int' lvalue prefix '*' cannot overflow // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: NullStmt Container C; @@ -229,32 +229,32 @@ void TestIteration() { // CHECK-NEXT: <<>> // 
CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:16 implicit used __range1 'Container &' cinit - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container':'Container' lvalue Var 0x{{[^ ]*}} 'C' 'Container':'Container' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container' lvalue Var 0x{{[^ ]*}} 'C' 'Container' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __begin1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __begin1 'int *' cinit // CHECK-NEXT: CXXMemberCallExpr 0x{{[^ ]*}} 'int *' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .begin 0x{{[^ ]*}} // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container':'Container' lvalue Var 0x{{[^ ]*}} '__range1' 'Container &' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container' lvalue Var 0x{{[^ ]*}} '__range1' 'Container &' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __end1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 implicit used __end1 'int *' cinit // CHECK-NEXT: CXXMemberCallExpr 0x{{[^ ]*}} 'int *' // CHECK-NEXT: MemberExpr 0x{{[^ ]*}} '' .end 0x{{[^ ]*}} // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container':'Container' lvalue Var 0x{{[^ ]*}} '__range1' 'Container &' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'Container' lvalue Var 0x{{[^ ]*}} '__range1' 'Container &' // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *':'int *' - // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *':'int *' lvalue prefix '++' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *' + // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *' lvalue prefix '++' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:12 v 'int' cinit // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int' lvalue prefix '*' cannot overflow // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: NullStmt for (int a; int v : vals) @@ -266,27 +266,27 @@ void TestIteration() { // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:23 implicit used __range1 'int (&)[10]' cinit // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} 'vals' 'int[10]' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:21 implicit used __begin1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:21 implicit used __begin1 'int *' cinit // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' // CHECK-NEXT: DeclStmt - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:21 implicit used __end1 'int *':'int *' cinit + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:21 implicit used __end1 'int *' cinit // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'int *' '+' // CHECK-NEXT: 
ImplicitCastExpr // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int[10]' lvalue Var 0x{{[^ ]*}} '__range1' 'int (&)[10]' // CHECK-NEXT: IntegerLiteral 0x{{[^ ]*}} 'long' 10 // CHECK-NEXT: BinaryOperator 0x{{[^ ]*}} 'bool' '!=' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *':'int *' - // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *':'int *' lvalue prefix '++' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__end1' 'int *' + // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int *' lvalue prefix '++' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: DeclStmt // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:19 v 'int' cinit // CHECK-NEXT: ImplicitCastExpr // CHECK-NEXT: UnaryOperator 0x{{[^ ]*}} 'int' lvalue prefix '*' cannot overflow // CHECK-NEXT: ImplicitCastExpr - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *':'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *':'int *' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'int *' lvalue Var 0x{{[^ ]*}} '__begin1' 'int *' // CHECK-NEXT: NullStmt } diff --git a/clang/test/AST/ast-dump-stmt.m b/clang/test/AST/ast-dump-stmt.m index 1d349ece966f8..e0fc16b3fa825 100644 --- a/clang/test/AST/ast-dump-stmt.m +++ b/clang/test/AST/ast-dump-stmt.m @@ -55,4 +55,4 @@ id TestCompoundLiteral(id a) { // CHECK: FunctionDecl{{.*}}TestCompoundLiteral // CHECK: ExprWithCleanups // CHECK-NEXT: cleanup CompoundLiteralExpr -// CHECK: CompoundLiteralExpr{{.*}}'S':'S' lvalue +// CHECK: CompoundLiteralExpr{{.*}}'S' lvalue diff --git a/clang/test/AST/ast-dump-template-decls-json.cpp b/clang/test/AST/ast-dump-template-decls-json.cpp index f51ef937d91db..00a656cd05917 100644 --- a/clang/test/AST/ast-dump-template-decls-json.cpp +++ b/clang/test/AST/ast-dump-template-decls-json.cpp @@ -826,7 +826,6 @@ void i(); // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "Uy", // CHECK-NEXT: "qualType": "Uy" // CHECK-NEXT: } // CHECK-NEXT: } diff --git a/clang/test/AST/ast-dump-template-decls.cpp b/clang/test/AST/ast-dump-template-decls.cpp index 847a437e0f85e..142bc9e6ad9a0 100644 --- a/clang/test/AST/ast-dump-template-decls.cpp +++ b/clang/test/AST/ast-dump-template-decls.cpp @@ -171,7 +171,7 @@ template class E {}; using test1 = D; // CHECK: TypeAliasDecl 0x{{[^ ]*}} col:7 test1 'D':'subst_default_argument::E>' // CHECK: TemplateSpecializationType 0x{{[^ ]*}} 'A' sugar A -// CHECK-NEXT: |-TemplateArgument type 'int':'int' +// CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[^ ]*}} 'int' sugar class depth 0 index 1 D2 // CHECK-NEXT: | |-TypeAliasTemplate 0x{{[^ ]*}} 'D' // CHECK-NEXT: | `-BuiltinType 0x{{[^ ]*}} 'int' diff --git a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp index f6d1caa121f2a..8c03b58abb0ed 100644 --- a/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp +++ b/clang/test/AST/ast-dump-template-json-win32-mangler-crash.cpp @@ -532,7 +532,6 @@ int main() // CHECK-NEXT: }, // CHECK-NEXT: "name": "type", // CHECK-NEXT: "type": { -// 
CHECK-NEXT: "desugaredQualType": "integral_constant<_Ty, _Val>", // CHECK-NEXT: "qualType": "integral_constant<_Ty, _Val>" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ @@ -872,7 +871,6 @@ int main() // CHECK-NEXT: }, // CHECK-NEXT: "name": "bool_constant", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "integral_constant", // CHECK-NEXT: "qualType": "integral_constant" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/ast-dump-temporaries-json.cpp b/clang/test/AST/ast-dump-temporaries-json.cpp index a8b14de29fcf9..0fd2762cee1a7 100644 --- a/clang/test/AST/ast-dump-temporaries-json.cpp +++ b/clang/test/AST/ast-dump-temporaries-json.cpp @@ -36,7 +36,6 @@ void MaterializeTemp() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "const S", // CHECK-NEXT: "qualType": "const S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -58,7 +57,6 @@ void MaterializeTemp() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "const S", // CHECK-NEXT: "qualType": "const S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "lvalue", @@ -89,7 +87,6 @@ void MaterializeTemp() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "const S", // CHECK-NEXT: "qualType": "const S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -111,7 +108,6 @@ void MaterializeTemp() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", @@ -141,7 +137,6 @@ void MaterializeTemp() { // CHECK-NEXT: } // CHECK-NEXT: }, // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "S", // CHECK-NEXT: "qualType": "S" // CHECK-NEXT: }, // CHECK-NEXT: "valueCategory": "prvalue", diff --git a/clang/test/AST/ast-dump-types-json.cpp b/clang/test/AST/ast-dump-types-json.cpp index 0a92483fb1c7f..c1bb9266fa869 100644 --- a/clang/test/AST/ast-dump-types-json.cpp +++ b/clang/test/AST/ast-dump-types-json.cpp @@ -101,7 +101,6 @@ using ::TestUsingShadowDeclType; // CHECK-NEXT: }, // CHECK-NEXT: "name": "TestElaboratedType2", // CHECK-NEXT: "type": { -// CHECK-NEXT: "desugaredQualType": "NS::S", // CHECK-NEXT: "qualType": "NS::S" // CHECK-NEXT: }, // CHECK-NEXT: "inner": [ diff --git a/clang/test/AST/coroutine-locals-cleanup.cpp b/clang/test/AST/coroutine-locals-cleanup.cpp index 6eb6fc0948cbb..ce106b8e230a1 100644 --- a/clang/test/AST/coroutine-locals-cleanup.cpp +++ b/clang/test/AST/coroutine-locals-cleanup.cpp @@ -85,7 +85,7 @@ Task bar() { // CHECK: CaseStmt // CHECK: ExprWithCleanups {{.*}} 'void' // CHECK-NEXT: CoawaitExpr -// CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'Task':'Task' (CXXTemporary {{.*}}) +// CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'Task' (CXXTemporary {{.*}}) // CHECK: MaterializeTemporaryExpr {{.*}} 'Awaiter':'Task::Awaiter' // CHECK: ExprWithCleanups {{.*}} 'bool' // CHECK-NEXT: CXXMemberCallExpr {{.*}} 'bool' @@ -98,7 +98,7 @@ Task bar() { // CHECK: CaseStmt // CHECK: ExprWithCleanups {{.*}} 'void' // CHECK-NEXT: CoawaitExpr -// CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'Task':'Task' (CXXTemporary {{.*}}) +// CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'Task' (CXXTemporary {{.*}}) // CHECK: MaterializeTemporaryExpr {{.*}} 'Awaiter':'Task::Awaiter' // CHECK: ExprWithCleanups {{.*}} 'bool' // CHECK-NEXT: CXXMemberCallExpr {{.*}} 'bool' diff --git a/clang/test/AST/float16.cpp 
b/clang/test/AST/float16.cpp index 9e0d70b9a1c00..a9e1144cf0958 100644 --- a/clang/test/AST/float16.cpp +++ b/clang/test/AST/float16.cpp @@ -85,7 +85,7 @@ auto C = -1.0f16 + B; //CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '__fp16' //CHECK-NEXT: | `-UnaryOperator {{.*}} 'double' prefix '-' //CHECK-NEXT: | `-FloatingLiteral {{.*}} 'double' 1.000000e-01 -//CHECK-NEXT: |-VarDecl {{.*}} C 'float':'float' cinit +//CHECK-NEXT: |-VarDecl {{.*}} C 'float' cinit //CHECK-NEXT: | `-BinaryOperator {{.*}} 'float' '+' //CHECK-NEXT: | |-ImplicitCastExpr {{.*}} 'float' //CHECK-NEXT: | | `-UnaryOperator {{.*}} '_Float16' prefix '-' @@ -100,7 +100,7 @@ auto C = -1.0f16 + B; //CHECK-NATIVE: | `-ImplicitCastExpr {{.*}} '__fp16' //CHECK-NATIVE: | `-UnaryOperator {{.*}} 'double' prefix '-' //CHECK-NATIVE: | `-FloatingLiteral {{.*}} 'double' 1.000000e-01 -//CHECK-NATIVE: |-VarDecl {{.*}} C '__fp16':'__fp16' cinit +//CHECK-NATIVE: |-VarDecl {{.*}} C '__fp16' cinit //CHECK-NATIVE: | `-BinaryOperator {{.*}} '__fp16' '+' //CHECK-NATIVE: | |-ImplicitCastExpr {{.*}} '__fp16' //CHECK-NATIVE: | | `-UnaryOperator {{.*}} '_Float16' prefix '-' @@ -178,12 +178,12 @@ template C func1t(C arg) { //CHECK-NEXT: | | `-FloatingLiteral {{.*}} '_Float16' 2.000000e+00 //CHECK-NEXT: | `-FunctionDecl {{.*}} used func1t '_Float16 (_Float16)' //CHECK-NEXT: | |-TemplateArgument type '_Float16' -//CHECK: | |-ParmVarDecl {{.*}} used arg '_Float16':'_Float16' +//CHECK: | |-ParmVarDecl {{.*}} used arg '_Float16' //CHECK-NEXT: | `-CompoundStmt //CHECK-NEXT: | `-ReturnStmt //CHECK-NEXT: | `-BinaryOperator {{.*}} '_Float16' '*' -//CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '_Float16':'_Float16' -//CHECK-NEXT: | | `-DeclRefExpr {{.*}} '_Float16':'_Float16' lvalue ParmVar {{.*}} 'arg' '_Float16':'_Float16' +//CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '_Float16' +//CHECK-NEXT: | | `-DeclRefExpr {{.*}} '_Float16' lvalue ParmVar {{.*}} 'arg' '_Float16' //CHECK-NEXT: | `-FloatingLiteral {{.*}} '_Float16' 2.000000e+00 @@ -223,14 +223,14 @@ int main(void) { //CHECK-NEXT: | `-FloatingLiteral {{.*}} 'double' 1.000977e+00 C1 c1(f1l); -//CHECK: | `-VarDecl{{.*}} used c1 'C1':'C1' callinit -//CHECK-NEXT: | `-CXXConstructExpr {{.*}} 'C1':'C1' 'void (_Float16) +//CHECK: | `-VarDecl{{.*}} used c1 'C1' callinit +//CHECK-NEXT: | `-CXXConstructExpr {{.*}} 'C1' 'void (_Float16) //CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | `-DeclRefExpr {{.*}} '_Float16' lvalue Var 0x{{.*}} 'f1l' '_Float16' S1<_Float16> s1 = { 132.f16 }; -//CHECK: | `-VarDecl {{.*}} used s1 'S1<_Float16>':'S1<_Float16>' cinit -//CHECK-NEXT: | `-InitListExpr {{.*}} 'S1<_Float16>':'S1<_Float16>' +//CHECK: | `-VarDecl {{.*}} used s1 'S1<_Float16>' cinit +//CHECK-NEXT: | `-InitListExpr {{.*}} 'S1<_Float16>' //CHECK-NEXT: | `-FloatingLiteral {{.*}} '_Float16' 1.320000e+02 _Float16 f4l = func1n(f1l) + func1f(f2l) + c1.func1c(f3l) + c1.func2c(f1l) + @@ -255,37 +255,37 @@ int main(void) { //CHECK-NEXT: | | | | | | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f2l' '_Float16' //CHECK-NEXT: | | | | | | `-CXXMemberCallExpr {{.*}} '_Float16' //CHECK-NEXT: | | | | | | |-MemberExpr {{.*}} '' .func1c {{.*}} -//CHECK-NEXT: | | | | | | | `-DeclRefExpr {{.*}} 'C1':'C1' lvalue Var {{.*}} 'c1' 'C1':'C1' +//CHECK-NEXT: | | | | | | | `-DeclRefExpr {{.*}} 'C1' lvalue Var {{.*}} 'c1' 'C1' //CHECK-NEXT: | | | | | | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | | | | | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f3l' '_Float16' //CHECK-NEXT: | | | | | `-CallExpr {{.*}} '_Float16' 
//CHECK-NEXT: | | | | | |-ImplicitCastExpr {{.*}} '_Float16 (*)(_Float16)' //CHECK-NEXT: | | | | | | `-MemberExpr {{.*}} '_Float16 (_Float16)' lvalue .func2c {{.*}} -//CHECK-NEXT: | | | | | | `-DeclRefExpr {{.*}} 'C1':'C1' lvalue Var {{.*}} 'c1' 'C1':'C1' +//CHECK-NEXT: | | | | | | `-DeclRefExpr {{.*}} 'C1' lvalue Var {{.*}} 'c1' 'C1' //CHECK-NEXT: | | | | | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | | | | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f1l' '_Float16' -//CHECK-NEXT: | | | | `-CallExpr {{.*}} '_Float16':'_Float16' +//CHECK-NEXT: | | | | `-CallExpr {{.*}} '_Float16' //CHECK-NEXT: | | | | |-ImplicitCastExpr {{.*}} '_Float16 (*)(_Float16)' //CHECK-NEXT: | | | | | `-DeclRefExpr {{.*}} '_Float16 (_Float16)' lvalue Function {{.*}} 'func1t' '_Float16 (_Float16)' (FunctionTemplate {{.*}} 'func1t') //CHECK-NEXT: | | | | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | | | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f1l' '_Float16' //CHECK-NEXT: | | | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | | | `-MemberExpr {{.*}} '_Float16' lvalue .mem2 {{.*}} -//CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'S1<_Float16>':'S1<_Float16>' lvalue Var {{.*}} 's1' 'S1<_Float16>':'S1<_Float16>' +//CHECK-NEXT: | | | `-DeclRefExpr {{.*}} 'S1<_Float16>' lvalue Var {{.*}} 's1' 'S1<_Float16>' //CHECK-NEXT: | | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f1n' '_Float16' //CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '_Float16' //CHECK-NEXT: | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f2n' '_Float16' auto f5l = -1.f16, *f6l = &f2l, f7l = func1t(f3l); -//CHECK: | |-VarDecl {{.*}} f5l '_Float16':'_Float16' cinit +//CHECK: | |-VarDecl {{.*}} f5l '_Float16' cinit //CHECK-NEXT: | | `-UnaryOperator {{.*}} '_Float16' prefix '-' //CHECK-NEXT: | | `-FloatingLiteral {{.*}} '_Float16' 1.000000e+00 //CHECK-NEXT: | |-VarDecl {{.*}} f6l '_Float16 *' cinit //CHECK-NEXT: | | `-UnaryOperator {{.*}} '_Float16 *' prefix '&' //CHECK-NEXT: | | `-DeclRefExpr {{.*}} '_Float16' lvalue Var {{.*}} 'f2l' '_Float16' -//CHECK-NEXT: | `-VarDecl {{.*}} f7l '_Float16':'_Float16' cinit -//CHECK-NEXT: | `-CallExpr {{.*}} '_Float16':'_Float16' +//CHECK-NEXT: | `-VarDecl {{.*}} f7l '_Float16' cinit +//CHECK-NEXT: | `-CallExpr {{.*}} '_Float16' //CHECK-NEXT: | |-ImplicitCastExpr {{.*}} '_Float16 (*)(_Float16)' //CHECK-NEXT: | | `-DeclRefExpr {{.*}} '_Float16 (_Float16)' lvalue Function {{.*}} 'func1t' '_Float16 (_Float16)' (FunctionTemplate {{.*}} 'func1t') //CHECK-NEXT: | `-ImplicitCastExpr {{.*}} '_Float16' diff --git a/clang/test/AST/nrvo.c b/clang/test/AST/nrvo.c index a3e12759ad0d9..078445cd972c4 100644 --- a/clang/test/AST/nrvo.c +++ b/clang/test/AST/nrvo.c @@ -7,11 +7,11 @@ struct A f1(void) { // CHECK-NEXT: CompoundStmt 0x{{[^ ]*}} struct A a; // CHECK-NEXT: DeclStmt 0x{{[^ ]*}} - // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:12 used a 'struct A':'struct A' nrvo + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:12 used a 'struct A' nrvo return a; // CHECK-NEXT: ReturnStmt 0x{{[^ ]*}} - // CHECK-NEXT: ImplicitCastExpr 0x{{[^ ]*}} 'struct A':'struct A' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct A':'struct A' lvalue Var 0x{{[^ ]*}} 'a' 'struct A':'struct A' + // CHECK-NEXT: ImplicitCastExpr 0x{{[^ ]*}} 'struct A' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct A' lvalue Var 0x{{[^ ]*}} 'a' 'struct A' } void f2(void) { @@ -20,10 +20,10 @@ void f2(void) { // CHECK-NEXT: CompoundStmt 0x{{[^ ]*}} struct A a; // CHECK-NEXT: DeclStmt 0x{{[^ ]*}} - // 
CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 used a 'struct A':'struct A' nrvo + // CHECK-NEXT: VarDecl 0x{{[^ ]*}} col:14 used a 'struct A' nrvo return a; // CHECK-NEXT: ReturnStmt 0x{{[^ ]*}} - // CHECK-NEXT: ImplicitCastExpr 0x{{[^ ]*}} 'struct A':'struct A' - // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct A':'struct A' lvalue Var 0x{{[^ ]*}} 'a' 'struct A':'struct A' + // CHECK-NEXT: ImplicitCastExpr 0x{{[^ ]*}} 'struct A' + // CHECK-NEXT: DeclRefExpr 0x{{[^ ]*}} 'struct A' lvalue Var 0x{{[^ ]*}} 'a' 'struct A' }(); } diff --git a/clang/test/AST/sourceranges.cpp b/clang/test/AST/sourceranges.cpp index 40135bd2e17c6..f78d34c1ee7e3 100644 --- a/clang/test/AST/sourceranges.cpp +++ b/clang/test/AST/sourceranges.cpp @@ -49,14 +49,14 @@ void construct() { A a = A(12); // CHECK: CXXConstructExpr {{0x[0-9a-fA-F]+}} 'A':'foo::A' 'void (int){{( __attribute__\(\(thiscall\)\))?}}' D d = D(12); - // CHECK: CXXConstructExpr {{0x[0-9a-fA-F]+}} 'D':'D' 'void (int){{( __attribute__\(\(thiscall\)\))?}}' + // CHECK: CXXConstructExpr {{0x[0-9a-fA-F]+}} 'D' 'void (int){{( __attribute__\(\(thiscall\)\))?}}' } namespace PR38987 { struct A { A(); }; template void f() { T{}; } template void f(); -// CHECK: CXXTemporaryObjectExpr {{.*}} 'PR38987::A':'PR38987::A' +// CHECK: CXXTemporaryObjectExpr {{.*}} 'PR38987::A' } void abort() __attribute__((noreturn)); diff --git a/clang/test/C/drs/dr253.c b/clang/test/C/drs/dr253.c index 3eae0a01586a7..1c6f610dbcca1 100644 --- a/clang/test/C/drs/dr253.c +++ b/clang/test/C/drs/dr253.c @@ -19,7 +19,7 @@ struct fred y [] = { { { "abc" }, 1 }, [0] = { .s[0] = 'q' } }; // CHECK: VarDecl 0x{{.*}} col:13 y 'struct fred[1]' cinit // CHECK-NEXT: InitListExpr 0x{{.*}} 'struct fred[1]' -// CHECK-NEXT: InitListExpr 0x{{.*}} 'struct fred':'struct fred' +// CHECK-NEXT: InitListExpr 0x{{.*}} 'struct fred' // CHECK-NEXT: InitListExpr 0x{{.*}} 'char[6]' // CHECK-NEXT: array_filler // CHECK-NEXT: ImplicitCastExpr diff --git a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p4-ast.cpp b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p4-ast.cpp index 4fc0a05ae1eca..32c4ddd921bba 100644 --- a/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p4-ast.cpp +++ b/clang/test/CXX/dcl.decl/dcl.init/dcl.init.ref/p4-ast.cpp @@ -3,7 +3,7 @@ void f() noexcept; // CHECK: VarDecl {{.*}} ref 'void (&)()' cinit -// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void ()':'void ()' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void ()' lvalue // CHECK-NEXT: DeclRefExpr {{.*}} 'void () noexcept' lvalue Function {{.*}} 'f' 'void () noexcept' void (&ref)() = f; @@ -13,6 +13,6 @@ struct X { } x; // CHECK: VarDecl {{.*}} xp 'void (&)()' cinit -// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void ()':'void ()' lvalue -// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void () noexcept':'void () noexcept' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void ()' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void () noexcept' lvalue void (&xp)() = x; diff --git a/clang/test/Import/objc-param-decl/test.m b/clang/test/Import/objc-param-decl/test.m index dc98c31200193..f2b6aa0927295 100644 --- a/clang/test/Import/objc-param-decl/test.m +++ b/clang/test/Import/objc-param-decl/test.m @@ -5,7 +5,7 @@ // CHECK-SAME: FirstParam // CHECK-SAME: 'id' // CHECK-NEXT: ObjCTypeParamDecl -// CHECK-SAME: 'id':'id' +// CHECK-SAME: 'id' void expr() { Dictionary *d; diff --git a/clang/test/OpenMP/align_clause_ast_print.cpp b/clang/test/OpenMP/align_clause_ast_print.cpp index 3ed602d533bc8..87000f9c41bae 100644 --- a/clang/test/OpenMP/align_clause_ast_print.cpp +++ 
b/clang/test/OpenMP/align_clause_ast_print.cpp @@ -122,7 +122,7 @@ int template_test() { // DUMP: NonTypeTemplateParmDecl {{.*}}'unsigned int' depth 0 index 1 size // DUMP: IntegerLiteral {{.*}}'unsigned int' 1 // DUMP: OMPAllocateDecl {{.*}} -// DUMP: DeclRefExpr {{.*}}'double':'double' lvalue Var {{.*}} 'foo' 'double':'double' +// DUMP: DeclRefExpr {{.*}}'double' lvalue Var {{.*}} 'foo' 'double' // DUMP: OMPAlignClause {{.*}} // DUMP: ConstantExpr {{.*}}'unsigned int' // DUMP: value: Int 1 diff --git a/clang/test/OpenMP/generic_loop_ast_print.cpp b/clang/test/OpenMP/generic_loop_ast_print.cpp index 29c48ffde8092..df806405571cf 100644 --- a/clang/test/OpenMP/generic_loop_ast_print.cpp +++ b/clang/test/OpenMP/generic_loop_ast_print.cpp @@ -51,15 +51,15 @@ //DUMP: FunctionDecl{{.*}}templ_foo 'void (int)' //DUMP: TemplateArgument type 'int' //DUMP: TemplateArgument integral 2 -//DUMP: ParmVarDecl{{.*}}'int':'int' +//DUMP: ParmVarDecl{{.*}}'int' //DUMP: OMPSimdDirective //DUMP: OMPCollapseClause //DUMP: ConstantExpr{{.*}}'int' //DUMP: value: Int 2 //DUMP: OMPReductionClause -//DUMP: DeclRefExpr{{.*}}'z' 'int':'int' +//DUMP: DeclRefExpr{{.*}}'z' 'int' //DUMP: OMPLastprivateClause -//DUMP: DeclRefExpr{{.*}}'j' 'int':'int' +//DUMP: DeclRefExpr{{.*}}'j' 'int' //DUMP: ForStmt template void templ_foo(T t) { diff --git a/clang/test/OpenMP/scope_ast_print.cpp b/clang/test/OpenMP/scope_ast_print.cpp index c69908980d229..c5c3a29738a72 100644 --- a/clang/test/OpenMP/scope_ast_print.cpp +++ b/clang/test/OpenMP/scope_ast_print.cpp @@ -81,8 +81,8 @@ int template_test() { //DUMP: BuiltinType {{.*}}'double' //DUMP: OMPScopeDirective //DUMP: OMPPrivateClause -//DUMP: DeclRefExpr {{.*}}'double':'double' lvalue Var {{.*}} 'a' 'double':'double' +//DUMP: DeclRefExpr {{.*}}'double' lvalue Var {{.*}} 'a' 'double' //DUMP: OMPReductionClause -//DUMP: DeclRefExpr {{.*}}'double':'double' lvalue Var {{.*}} 'b' 'double':'double' +//DUMP: DeclRefExpr {{.*}}'double' lvalue Var {{.*}} 'b' 'double' //PRINT: #pragma omp scope private(a) reduction(*: b) #endif // HEADER diff --git a/clang/test/SemaCXX/co_await-ast.cpp b/clang/test/SemaCXX/co_await-ast.cpp index b9eae493be952..10cee21da0e87 100644 --- a/clang/test/SemaCXX/co_await-ast.cpp +++ b/clang/test/SemaCXX/co_await-ast.cpp @@ -47,13 +47,13 @@ awaitable foo() { // CHECK: |-CompoundStmt {{.*}} // CHECK: | `-ExprWithCleanups {{.*}} 'void' // CHECK: | `-CoawaitExpr {{.*}} 'void' -// CHECK: | |-CXXTemporaryObjectExpr {{.*}} 'executor':'executor' 'void (){{.*}} noexcept' zeroing +// CHECK: | |-CXXTemporaryObjectExpr {{.*}} 'executor' 'void (){{.*}} noexcept' zeroing // CHECK: | |-MaterializeTemporaryExpr {{.*}} 'result_t':'awaitable_frame::result_t' lvalue // CHECK: | | `-CXXBindTemporaryExpr {{.*}} 'result_t':'awaitable_frame::result_t' (CXXTemporary {{.*}}) // CHECK: | | `-CXXMemberCallExpr {{.*}} 'result_t':'awaitable_frame::result_t' // CHECK: | | |-MemberExpr {{.*}} '' .await_transform {{.*}} // CHECK: | | | `-DeclRefExpr {{.*}} 'std::coroutine_traits::promise_type':'awaitable_frame' lvalue Var {{.*}} '__promise' 'std::coroutine_traits::promise_type':'awaitable_frame' -// CHECK: | | `-CXXTemporaryObjectExpr {{.*}} 'executor':'executor' 'void (){{.*}} noexcept' zeroing +// CHECK: | | `-CXXTemporaryObjectExpr {{.*}} 'executor' 'void (){{.*}} noexcept' zeroing // CHECK: | |-ExprWithCleanups {{.*}} 'bool' // CHECK: | | `-CXXMemberCallExpr {{.*}} 'bool' // CHECK: | | `-MemberExpr {{.*}} '' .await_ready {{.*}} @@ -64,7 +64,7 @@ awaitable foo() { // CHECK: | | 
`-CXXMemberCallExpr {{.*}} 'result_t':'awaitable_frame::result_t' // CHECK: | | |-MemberExpr {{.*}} '' .await_transform {{.*}} // CHECK: | | | `-DeclRefExpr {{.*}} 'std::coroutine_traits::promise_type':'awaitable_frame' lvalue Var {{.*}} '__promise' 'std::coroutine_traits::promise_type':'awaitable_frame' -// CHECK: | | `-CXXTemporaryObjectExpr {{.*}} 'executor':'executor' 'void (){{.*}} noexcept' zeroing +// CHECK: | | `-CXXTemporaryObjectExpr {{.*}} 'executor' 'void (){{.*}} noexcept' zeroing // CHECK: | |-ExprWithCleanups {{.*}} 'void' // CHECK: | | `-CXXMemberCallExpr {{.*}} 'void' // CHECK: | | |-MemberExpr {{.*}} '' .await_suspend {{.*}} @@ -74,9 +74,9 @@ awaitable foo() { // CHECK: | | | `-CXXMemberCallExpr {{.*}} 'result_t':'awaitable_frame::result_t' // CHECK: | | | |-MemberExpr {{.*}} '' .await_transform {{.*}} // CHECK: | | | | `-DeclRefExpr {{.*}} 'std::coroutine_traits::promise_type':'awaitable_frame' lvalue Var {{.*}} '__promise' 'std::coroutine_traits::promise_type':'awaitable_frame' -// CHECK: | | | `-CXXTemporaryObjectExpr {{.*}} 'executor':'executor' 'void (){{.*}} noexcept' zeroing -// CHECK: | | `-ImplicitCastExpr {{.*}} 'std::coroutine_handle':'std::coroutine_handle' -// CHECK: | | `-CXXConstructExpr {{.*}} 'std::coroutine_handle':'std::coroutine_handle' 'void (coroutine_handle &&){{.*}} noexcept' +// CHECK: | | | `-CXXTemporaryObjectExpr {{.*}} 'executor' 'void (){{.*}} noexcept' zeroing +// CHECK: | | `-ImplicitCastExpr {{.*}} 'std::coroutine_handle' +// CHECK: | | `-CXXConstructExpr {{.*}} 'std::coroutine_handle' 'void (coroutine_handle &&){{.*}} noexcept' // CHECK: | | `-MaterializeTemporaryExpr {{.*}} 'coroutine_handle':'std::coroutine_handle' xvalue // CHECK: | | `-CallExpr {{.*}} 'coroutine_handle':'std::coroutine_handle' // CHECK: | | |-ImplicitCastExpr {{.*}} 'coroutine_handle (*)(void *) noexcept' @@ -93,6 +93,6 @@ awaitable foo() { // CHECK: | `-CXXMemberCallExpr {{.*}} 'result_t':'awaitable_frame::result_t' // CHECK: | |-MemberExpr {{.*}} '' .await_transform {{.*}} // CHECK: | | `-DeclRefExpr {{.*}} 'std::coroutine_traits::promise_type':'awaitable_frame' lvalue Var {{.*}} '__promise' 'std::coroutine_traits::promise_type':'awaitable_frame' -// CHECK: | `-CXXTemporaryObjectExpr {{.*}} 'executor':'executor' 'void (){{.*}} noexcept' zeroing +// CHECK: | `-CXXTemporaryObjectExpr {{.*}} 'executor' 'void (){{.*}} noexcept' zeroing // Rest of the generated coroutine statements omitted. 
diff --git a/clang/test/SemaCXX/consteval-cleanup.cpp b/clang/test/SemaCXX/consteval-cleanup.cpp index ddc56174b484f..499c45db50177 100644 --- a/clang/test/SemaCXX/consteval-cleanup.cpp +++ b/clang/test/SemaCXX/consteval-cleanup.cpp @@ -21,11 +21,11 @@ void foo() { // CHECK: foo // CHECK: ExprWithCleanups // CHECK-NEXT: BinaryOperator {{.*}} 'bool' ',' - // CHECK-NEXT: BinaryOperator {{.*}} 'P':'P' ',' - // CHECK-NEXT: CXXFunctionalCastExpr {{.*}} 'A':'A' - // CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'A':'A' - // CHECK-NEXT: CXXConstructExpr {{.*}} 'A':'A' - // CHECK: ConstantExpr {{.*}} 'P':'P' + // CHECK-NEXT: BinaryOperator {{.*}} 'P' ',' + // CHECK-NEXT: CXXFunctionalCastExpr {{.*}} 'A' + // CHECK-NEXT: CXXBindTemporaryExpr {{.*}} 'A' + // CHECK-NEXT: CXXConstructExpr {{.*}} 'A' + // CHECK: ConstantExpr {{.*}} 'P' // CHECK-NEXT: value: // CHECK-NEXT: ExprWithCleanups } @@ -36,10 +36,10 @@ void foobar() { // CHECK: ExprWithCleanups // CHECK-NEXT: cleanup Block // CHECK-NEXT: BinaryOperator {{.*}} 'bool' ',' - // CHECK-NEXT: BinaryOperator {{.*}} 'P':'P' ',' + // CHECK-NEXT: BinaryOperator {{.*}} 'P' ',' // CHECK-NEXT: CallExpr // CHECK-NEXT: BlockExpr - // CHECK: ConstantExpr {{.*}} 'P':'P' + // CHECK: ConstantExpr {{.*}} 'P' // CHECK-NEXT: value: // CHECK-NEXT: ExprWithCleanups // CHECK-NOT: cleanup Block diff --git a/clang/test/SemaOpenCLCXX/address-space-deduction.clcpp b/clang/test/SemaOpenCLCXX/address-space-deduction.clcpp index 3c801f2c4db76..526e86584c77b 100644 --- a/clang/test/SemaOpenCLCXX/address-space-deduction.clcpp +++ b/clang/test/SemaOpenCLCXX/address-space-deduction.clcpp @@ -6,7 +6,7 @@ constexpr int foo = 0; //CHECK: |-VarDecl {{.*}} foo1 'T' cinit -//CHECK: `-VarTemplateSpecializationDecl {{.*}} used foo1 '__global long':'__global long' implicit_instantiation cinit +//CHECK: `-VarTemplateSpecializationDecl {{.*}} used foo1 '__global long' implicit_instantiation cinit template T foo1 = 0; diff --git a/clang/test/SemaOpenCLCXX/addrspace-auto.clcpp b/clang/test/SemaOpenCLCXX/addrspace-auto.clcpp index 7862564d1b4ef..97fad5939955c 100644 --- a/clang/test/SemaOpenCLCXX/addrspace-auto.clcpp +++ b/clang/test/SemaOpenCLCXX/addrspace-auto.clcpp @@ -1,18 +1,18 @@ //RUN: %clang_cc1 %s -pedantic -ast-dump -verify | FileCheck %s __constant int i = 1; -//CHECK: |-VarDecl {{.*}} ai '__global int':'__global int' +//CHECK: |-VarDecl {{.*}} ai '__global int' auto ai = i; kernel void test() { int i; - //CHECK: VarDecl {{.*}} ai '__private int':'__private int' + //CHECK: VarDecl {{.*}} ai '__private int' auto ai = i; constexpr int c = 1; - //CHECK: VarDecl {{.*}} used cai '__constant int':'__constant int' + //CHECK: VarDecl {{.*}} used cai '__constant int' __constant auto cai = c; - //CHECK: VarDecl {{.*}} aii '__private int':'__private int' + //CHECK: VarDecl {{.*}} aii '__private int' auto aii = cai; //CHECK: VarDecl {{.*}} ref '__private int &__private' diff --git a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp index d455d424ab3d0..7f535651bb815 100644 --- a/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp +++ b/clang/test/SemaTemplate/aggregate-deduction-candidate.cpp @@ -21,8 +21,8 @@ namespace Basic { // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used 'auto (double, double) -> Basic::A' // CHECK: |-TemplateArgument type 'double' // CHECK: | `-BuiltinType {{.*}} 'double' - // CHECK: |-ParmVarDecl {{.*}} 'double':'double' - // CHECK: `-ParmVarDecl {{.*}} 'double':'double' + // CHECK: |-ParmVarDecl 
{{.*}} 'double' + // CHECK: `-ParmVarDecl {{.*}} 'double' // CHECK: FunctionProtoType {{.*}} 'auto (T, T) -> A' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'A' dependent // CHECK: | `-CXXRecord {{.*}} 'A' @@ -65,13 +65,13 @@ namespace Basic { // CHECK: FunctionTemplateDecl {{.*}} implicit // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 T // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit 'auto (S, T) -> C' - // CHECK: | |-ParmVarDecl {{.*}} 'S':'S' + // CHECK: | |-ParmVarDecl {{.*}} 'S' // CHECK: | `-ParmVarDecl {{.*}} 'T' // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used 'auto (S, int) -> Basic::C' // CHECK: |-TemplateArgument type 'int' // CHECK: | `-BuiltinType {{.*}} 'int' // CHECK: |-ParmVarDecl {{.*}} 'S':'Basic::S' - // CHECK: `-ParmVarDecl {{.*}} 'int':'int' + // CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (S, T) -> C' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'C' dependent // CHECK: | `-CXXRecord {{.*}} 'C' @@ -87,8 +87,8 @@ namespace Basic { // CHECK: FunctionTemplateDecl {{.*}} implicit // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 T // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit 'auto (int, int) -> D' - // CHECK: |-ParmVarDecl {{.*}} 'int':'int' - // CHECK: `-ParmVarDecl {{.*}} 'int':'int' + // CHECK: |-ParmVarDecl {{.*}} 'int' + // CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (int, int) -> D' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'D' dependent // CHECK: | `-CXXRecord {{.*}} 'D' @@ -117,7 +117,7 @@ namespace Basic { // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used 'auto (int, decltype(t)) -> Basic::E' // CHECK: |-TemplateArgument type 'int' // CHECK: | `-BuiltinType {{.*}} 'int' - // CHECK: |-ParmVarDecl {{.*}} 'int':'int' + // CHECK: |-ParmVarDecl {{.*}} 'int' // CHECK: `-ParmVarDecl {{.*}} 'decltype(t)':'int' // CHECK: FunctionProtoType {{.*}} 'auto (T, decltype(t)) -> E' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'E' dependent @@ -150,7 +150,7 @@ namespace Basic { // CHECK: |-TemplateArgument type 'int' // CHECK: | `-BuiltinType {{.*}} 'int' // CHECK: |-ParmVarDecl {{.*}} 'typename I::type':'int' - // CHECK: `-ParmVarDecl {{.*}} 'int':'int' + // CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (typename I::type, T) -> F' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'F' dependent // CHECK: | `-CXXRecord {{.*}} 'F' @@ -235,8 +235,8 @@ namespace BraceElision { // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit used 'auto (int, int) -> BraceElision::A' // CHECK: |-TemplateArgument type 'int' // CHECK: | `-BuiltinType {{.*}} 'int' - // CHECK: |-ParmVarDecl {{.*}} 'int':'int' - // CHECK: `-ParmVarDecl {{.*}} 'int':'int' + // CHECK: |-ParmVarDecl {{.*}} 'int' + // CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (T, T) -> A' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'A' dependent // CHECK: | `-CXXRecord {{.*}} 'A' @@ -275,8 +275,8 @@ namespace TrailingPack { // CHECK: | `-TemplateArgument type 'TrailingPack::(lambda at {{.*}})' // CHECK: | `-RecordType {{.*}} 'TrailingPack::(lambda at {{.*}})' // CHECK: | `-CXXRecord {{.*}} '' - // CHECK: |-ParmVarDecl {{.*}} 'TrailingPack::(lambda at {{.*}})':'TrailingPack::(lambda at {{.*}})' - // CHECK: `-ParmVarDecl {{.*}} 'TrailingPack::(lambda at {{.*}})':'TrailingPack::(lambda at 
{{.*}})' + // CHECK: |-ParmVarDecl {{.*}} 'TrailingPack::(lambda at {{.*}})' + // CHECK: `-ParmVarDecl {{.*}} 'TrailingPack::(lambda at {{.*}})' // CHECK: FunctionProtoType {{.*}} 'auto (T...) -> A' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'A' dependent // CHECK: | `-CXXRecord {{.*}} 'A' @@ -324,7 +324,7 @@ namespace DeduceArity { // CHECK: FunctionTemplateDecl {{.*}} implicit // CHECK: |-TemplateTypeParmDecl {{.*}} referenced typename depth 0 index 0 ... T // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit 'auto (Types, T...) -> F' - // CHECK: | |-ParmVarDecl {{.*}} 'Types':'Types' + // CHECK: | |-ParmVarDecl {{.*}} 'Types' // CHECK: | `-ParmVarDecl {{.*}} 'T...' pack // CHECK: |-CXXDeductionGuideDecl {{.*}} implicit used // CHECK-SAME: 'auto (Types, DeduceArity::X, DeduceArity::Y, DeduceArity::Z) -> @@ -340,16 +340,16 @@ namespace DeduceArity { // CHECK: | | `-RecordType {{.*}} 'DeduceArity::Z' // CHECK: | | `-CXXRecord {{.*}} 'Z' // CHECK: | |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' - // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::X':'DeduceArity::X' - // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::Y':'DeduceArity::Y' - // CHECK: | `-ParmVarDecl {{.*}} 'DeduceArity::Z':'DeduceArity::Z' + // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::X' + // CHECK: | |-ParmVarDecl {{.*}} 'DeduceArity::Y' + // CHECK: | `-ParmVarDecl {{.*}} 'DeduceArity::Z' // CHECK: `-CXXDeductionGuideDecl {{.*}} implicit 'auto (Types, DeduceArity::X) -> DeduceArity::F' // CHECK: |-TemplateArgument pack // CHECK: | `-TemplateArgument type 'DeduceArity::X' // CHECK: | `-RecordType {{.*}} 'DeduceArity::X' // CHECK: | `-CXXRecord {{.*}} 'X' // CHECK: |-ParmVarDecl {{.*}} 'Types':'DeduceArity::Types' - // CHECK: `-ParmVarDecl {{.*}} 'DeduceArity::X':'DeduceArity::X' + // CHECK: `-ParmVarDecl {{.*}} 'DeduceArity::X' // CHECK: FunctionProtoType {{.*}} 'auto (Types, T...) 
-> F' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'F' dependent // CHECK: | `-CXXRecord {{.*}} 'F' diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index 9de2975656ee2..16c7083df29d0 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -37,7 +37,7 @@ using AT = A; // CHECK: | | `-Var {{.*}} 'arr1' 'int[3]' // CHECK: | `-TemplateArgument decl // CHECK: | `-Var {{.*}} 'arr2' 'int[3]' -// CHECK: |-ParmVarDecl {{.*}} 'X<&arr1, &arr2>':'X<&arr1, &arr2>' +// CHECK: |-ParmVarDecl {{.*}} 'X<&arr1, &arr2>' // CHECK: |-ParmVarDecl {{.*}} 'int (*)[3]' // CHECK: |-ParmVarDecl {{.*}} 'int (*)[3]' // CHECK: `-ParmVarDecl {{.*}} 'short (*)[4]' @@ -76,7 +76,7 @@ using BT = B; // CHECK: |-TemplateArgument integral 120 // CHECK: |-TemplateArgument type 'std::nullptr_t' // CHECK: |-TemplateArgument nullptr -// CHECK: `-ParmVarDecl {{.*}} 'X':'X' +// CHECK: `-ParmVarDecl {{.*}} 'X' // CHECK: FunctionProtoType {{.*}} 'auto (X) -> B' dependent trailing_return // CHECK: |-InjectedClassNameType {{.*}} 'B' dependent // CHECK: `-TemplateSpecializationType {{.*}} 'X' dependent X @@ -111,9 +111,9 @@ using CT = C; // CHECK: |-TemplateArgument template B // CHECK: |-TemplateArgument type 'int' // CHECK: |-TemplateArgument integral 0 -// CHECK: |-ParmVarDecl {{.*}} 'int':'int' -// CHECK: |-ParmVarDecl {{.*}} 'Y':'Y' -// CHECK: `-ParmVarDecl {{.*}} 'int':'int' +// CHECK: |-ParmVarDecl {{.*}} 'int' +// CHECK: |-ParmVarDecl {{.*}} 'Y' +// CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (A, Y<>, type-parameter-0-2) -> C' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'C' dependent // CHECK: |-TemplateTypeParmType {{.*}} 'A' dependent depth 0 index 0 @@ -234,7 +234,7 @@ F s(0); // CHECK: |-TemplateArgument integral 120 // CHECK: |-TemplateArgument type 'int' // CHECK: | `-BuiltinType {{.*}} 'int' -// CHECK: `-ParmVarDecl {{.*}} 'int':'int' +// CHECK: `-ParmVarDecl {{.*}} 'int' // CHECK: FunctionProtoType {{.*}} 'auto (type-parameter-0-1) -> F<>' dependent trailing_return cdecl // CHECK: |-InjectedClassNameType {{.*}} 'F<>' dependent // CHECK: | `-CXXRecord {{.*}} 'F' diff --git a/clang/test/SemaTemplate/default-expr-arguments-3.cpp b/clang/test/SemaTemplate/default-expr-arguments-3.cpp index 4bbdb6f91ec7e..4d04209e110b3 100644 --- a/clang/test/SemaTemplate/default-expr-arguments-3.cpp +++ b/clang/test/SemaTemplate/default-expr-arguments-3.cpp @@ -4,7 +4,7 @@ // CHECK: FunctionDecl {{.*}} used func 'void ()' // CHECK-NEXT: TemplateArgument type 'int' // CHECK: LambdaExpr {{.*}} '(lambda at -// CHECK: ParmVarDecl {{.*}} used f 'foo':'foo' cinit +// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit // CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo' namespace PR28795 { @@ -23,7 +23,7 @@ namespace PR28795 { // CHECK: ClassTemplateSpecializationDecl {{.*}} struct class2 definition // CHECK: TemplateArgument type 'int' // CHECK: LambdaExpr {{.*}} '(lambda at -// CHECK: ParmVarDecl {{.*}} used f 'foo':'foo' cinit +// CHECK: ParmVarDecl {{.*}} used f 'foo' cinit // CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo' // Template struct case: @@ -41,7 +41,7 @@ template struct class2; // CHECK-NEXT: FunctionDecl {{.*}} f1 'void ()' // CHECK: FunctionDecl {{.*}} f1 'void ()' // CHECK-NEXT: TemplateArgument type 'int' -// CHECK: ParmVarDecl {{.*}} n 'foo':'foo' cinit +// CHECK: ParmVarDecl {{.*}} n 'foo' cinit // 
CHECK-NEXT: DeclRefExpr {{.*}} 'foo' EnumConstant {{.*}} 'a' 'foo' template diff --git a/clang/test/SemaTemplate/make_integer_seq.cpp b/clang/test/SemaTemplate/make_integer_seq.cpp index 12520167b93e5..644bf41f8614c 100644 --- a/clang/test/SemaTemplate/make_integer_seq.cpp +++ b/clang/test/SemaTemplate/make_integer_seq.cpp @@ -37,7 +37,7 @@ using test2 = B; // CHECK-NEXT: `-ElaboratedType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq' sugar // CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} '__make_integer_seq' sugar alias __make_integer_seq // CHECK-NEXT: |-TemplateArgument template A -// CHECK-NEXT: |-TemplateArgument type 'int':'int' +// CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1 // CHECK-NEXT: | |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' @@ -48,14 +48,14 @@ using test2 = B; // CHECK-NEXT: | |-NonTypeTemplateParmDecl 0x{{[0-9A-Fa-f]+}} col:24 referenced 'B1' depth 0 index 1 B2 // CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 1 // CHECK-NEXT: `-TemplateSpecializationType 0x{{[0-9A-Fa-f]+}} 'A' sugar A -// CHECK-NEXT: |-TemplateArgument type 'int':'int' +// CHECK-NEXT: |-TemplateArgument type 'int' // CHECK-NEXT: | `-SubstTemplateTypeParmType 0x{{[0-9A-Fa-f]+}} 'int' sugar class depth 0 index 0 B1 // CHECK-NEXT: | |-TypeAliasTemplate 0x{{[0-9A-Fa-f]+}} 'B' // CHECK-NEXT: | `-BuiltinType 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: |-TemplateArgument expr // CHECK-NEXT: | `-ConstantExpr 0x{{[0-9A-Fa-f]+}} 'int' // CHECK-NEXT: | |-value: Int 0 -// CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int':'int' 0 +// CHECK-NEXT: | `-IntegerLiteral 0x{{[0-9A-Fa-f]+}} 'int' 0 // CHECK-NEXT: `-RecordType 0x{{[0-9A-Fa-f]+}} 'A' // CHECK-NEXT: `-ClassTemplateSpecialization 0x{{[0-9A-Fa-f]+}} 'A' @@ -99,7 +99,7 @@ template